// /data/users/hoss/faiss/utils.cpp
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include "utils.h"

#include <cstdio>
#include <cassert>
#include <cstring>
#include <cmath>

#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <omp.h>

#include <immintrin.h>

#include <algorithm>
#include <memory>   // for std::unique_ptr used below
#include <vector>

#include "AuxIndexStructures.h"
#include "FaissAssert.h"



#ifndef FINTEGER
#define FINTEGER long
#endif


extern "C" {

/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */

int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *n,
            FINTEGER *k, const float *alpha, const float *a, FINTEGER *lda,
            const float *b, FINTEGER *ldb,
            float *beta, float *c, FINTEGER *ldc);

/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */

int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
             float *tau, float *work, FINTEGER *lwork, FINTEGER *info);

int sorgqr_ (FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
             FINTEGER *lda, float *tau, float *work,
             FINTEGER *lwork, FINTEGER *info);

int sgemv_ (const char *trans, FINTEGER *m, FINTEGER *n, float *alpha,
            const float *a, FINTEGER *lda, const float *x, FINTEGER *incx,
            float *beta, float *y, FINTEGER *incy);

}


/**************************************************
 * Get some stats about the system
 **************************************************/

namespace faiss {

double getmillisecs () {
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
}


#ifdef __linux__

size_t get_mem_usage_kb ()
{
    int pid = getpid ();
    char fname[256];
    snprintf (fname, 256, "/proc/%d/status", pid);
    FILE * f = fopen (fname, "r");
    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
    size_t sz = 0;
    for (;;) {
        char buf[256];
        if (!fgets (buf, 256, f)) break;
        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
    }
    fclose (f);
    return sz;
}

#elif __APPLE__

size_t get_mem_usage_kb ()
{
    fprintf (stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
    return 0;
}

#endif


/**************************************************
 * Random data generation functions
 **************************************************/

RandomGenerator::RandomGenerator (long seed)
    : mt((unsigned int)seed) {}

int RandomGenerator::rand_int ()
{
    return mt() & 0x7fffffff;
}

long RandomGenerator::rand_long ()
{
    return long(rand_int()) | long(rand_int()) << 31;
}

int RandomGenerator::rand_int (int max)
{
    return mt() % max;
}

float RandomGenerator::rand_float ()
{
    return mt() / float(mt.max());
}

double RandomGenerator::rand_double ()
{
    return mt() / double(mt.max());
}


/***********************************************************************
 * Random functions in this file only exist because the Torch
 * counterparts are slow and not multi-threaded. Typical use is for
 * 1-100 billion values. */


/* Generate a set of random floating point values such that x[i] in [0,1].
   The array is filled in fixed-size blocks so that generation can be
   multi-threaded; for this reason, we rely on re-entrant functions. */
void float_rand (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_float ();
    }
}


void float_randn (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        double a = 0, b = 0, s = 0;
        int state = 0; /* generate two numbers per "do-while" loop */

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++) {
            /* Marsaglia's polar method (see Knuth) */
            if (state == 0) {
                do {
                    a = 2.0 * rng.rand_double () - 1;
                    b = 2.0 * rng.rand_double () - 1;
                    s = a * a + b * b;
                } while (s >= 1.0);
                x[i] = a * sqrt(-2.0 * log(s) / s);
            }
            else
                x[i] = b * sqrt(-2.0 * log(s) / s);
            state = 1 - state;
        }
    }
}
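
/* Sketch (not part of Faiss): a quick sanity check of float_randn.  For a
   large sample the empirical mean should be near 0 and the variance near 1;
   the tolerances below are loose and arbitrary.  Guarded out so the library
   build is unchanged. */
#if 0
static void example_check_float_randn ()
{
    const size_t n = 1000 * 1000;
    std::vector<float> x (n);
    float_randn (x.data (), n, 1234);
    double sum = 0, sum2 = 0;
    for (size_t i = 0; i < n; i++) {
        sum += x[i];
        sum2 += x[i] * x[i];
    }
    double mean = sum / n;
    double var = sum2 / n - mean * mean;
    assert (fabs (mean) < 0.01 && fabs (var - 1.0) < 0.01);
}
#endif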


/* Integer versions */
void long_rand (long * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;
        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}



void rand_perm (int *perm, size_t n, long seed)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;

    RandomGenerator rng (seed);

    for (size_t i = 0; i + 1 < n; i++) {
        int i2 = i + rng.rand_int (n - i);
        std::swap(perm[i], perm[i2]);
    }
}
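
/* Sketch (not part of Faiss): rand_perm is a Fisher-Yates shuffle, so the
   first k entries of the permutation are a uniform random sample of k
   distinct indices in [0, n).  The function name is illustrative only;
   guarded out. */
#if 0
static std::vector<int> example_sample_without_replacement (size_t n, size_t k)
{
    std::vector<int> perm (n);
    rand_perm (perm.data (), n, 5678);
    return std::vector<int> (perm.begin (), perm.begin () + k);
}
#endif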




void byte_rand (uint8_t * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        size_t i;
        for (i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}


void reflection (const float * __restrict u,
                 float * __restrict x,
                 size_t n, size_t d, size_t nu)
{
    // 2-way unrolled inner loop: assumes d is even
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            float ip1 = 0, ip2 = 0;

            for (j = 0; j < d; j += 2) {
                ip1 += up[j] * x[j];
                ip2 += up[j+1] * x[j+1];
            }
            float ip = 2 * (ip1 + ip2);

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];
            up += d;
        }
        x += d;
    }
}


/* Reference implementation (slower) */
void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            double ip = 0;

            for (j = 0; j < d; j++)
                ip += up[j] * x[j];
            ip *= 2;

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];

            up += d;
        }
        x += d;
    }
}
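
/* Sketch (not part of Faiss): a Householder reflection x <- x - 2 <u,x> u
   is an involution when ||u|| = 1, so applying reflection_ref twice with
   the same normalized u must give back x up to roundoff.  The tolerance is
   arbitrary; guarded out. */
#if 0
static void example_check_reflection (size_t d)
{
    std::vector<float> u (d), x (d);
    float_randn (u.data (), d, 1);
    float_randn (x.data (), d, 2);
    fvec_renorm_L2 (d, 1, u.data ());   // make ||u|| = 1
    std::vector<float> x0 = x;
    reflection_ref (u.data (), x.data (), 1, d, 1);
    reflection_ref (u.data (), x.data (), 1, d, 1);
    for (size_t j = 0; j < d; j++)
        assert (fabs (x[j] - x0[j]) < 1e-4);
}
#endif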




/***************************************************************************
 * Matrix/vector ops
 ***************************************************************************/



/* Compute the inner products between a vector x and a set of ny vectors y.
   These functions are not intended to replace BLAS matrix-matrix products,
   which are significantly more efficient for that case. */
void fvec_inner_products_ny (float * ip,
                             const float * x,
                             const float * y,
                             size_t d, size_t ny)
{
    // not clear whether the BLAS call below or the scalar loop is faster
#if 0
    {
        FINTEGER di = d;
        FINTEGER nyi = ny;
        float one = 1.0, zero = 0.0;
        FINTEGER onei = 1;
        sgemv_ ("T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
    }
#endif
    for (size_t i = 0; i < ny; i++) {
        ip[i] = fvec_inner_product (x, y, d);
        y += d;
    }
}




/* Compute the L2 norms of a set of nx vectors */
void fvec_norms_L2 (float * __restrict nr,
                    const float * __restrict x,
                    size_t d, size_t nx)
{

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
    }
}

void fvec_norms_L2sqr (float * __restrict nr,
                       const float * __restrict x,
                       size_t d, size_t nx)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++)
        nr[i] = fvec_norm_L2sqr (x + i * d, d);
}



void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        float * __restrict xi = x + i * d;

        float nr = fvec_norm_L2sqr (xi, d);

        if (nr > 0) {
            size_t j;
            const float inv_nr = 1.0 / sqrtf (nr);
            for (j = 0; j < d; j++)
                xi[j] *= inv_nr;
        }
    }
}
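
/* Sketch (not part of Faiss): cosine similarity search reduces to maximum
   inner product on L2-normalized vectors, which is how fvec_renorm_L2 is
   typically combined with knn_inner_product.  The aggregate initializer
   assumes the HeapArray field order (nh, k, ids, val) from Heap.h; guarded
   out. */
#if 0
static void example_cosine_knn (float *xq, float *xb,
                                size_t d, size_t nq, size_t nb, size_t k,
                                float *distances, long *labels)
{
    fvec_renorm_L2 (d, nq, xq);   // normalize queries in place
    fvec_renorm_L2 (d, nb, xb);   // normalize database in place
    float_minheap_array_t res = {nq, k, labels, distances};
    knn_inner_product (xq, xb, d, nq, nb, &res);
}
#endif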
/***************************************************************************
 * KNN functions
 ***************************************************************************/



/* Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_sse (const float * x,
                                   const float * y,
                                   size_t d, size_t nx, size_t ny,
                                   float_minheap_array_t * res)
{
    size_t k = res->k;
    size_t check_period = InterruptCallback::get_period_hint (ny * d);

    for (size_t i0 = 0; i0 < nx; i0 += check_period) {
        size_t i1 = std::min(i0 + check_period, nx);

#pragma omp parallel for
        for (size_t i = i0; i < i1; i++) {
            const float * x_i = x + i * d;
            const float * y_j = y;

            float * __restrict simi = res->get_val(i);
            long * __restrict idxi = res->get_ids (i);

            minheap_heapify (k, simi, idxi);

            for (size_t j = 0; j < ny; j++) {
                float ip = fvec_inner_product (x_i, y_j, d);

                if (ip > simi[0]) {
                    minheap_pop (k, simi, idxi);
                    minheap_push (k, simi, idxi, ip, j);
                }
                y_j += d;
            }
            minheap_reorder (k, simi, idxi);
        }
        InterruptCallback::check ();
    }

}

static void knn_L2sqr_sse (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res)
{
    size_t k = res->k;

    size_t check_period = InterruptCallback::get_period_hint (ny * d);

    for (size_t i0 = 0; i0 < nx; i0 += check_period) {
        size_t i1 = std::min(i0 + check_period, nx);

#pragma omp parallel for
        for (size_t i = i0; i < i1; i++) {
            const float * x_i = x + i * d;
            const float * y_j = y;
            size_t j;
            float * simi = res->get_val(i);
            long * idxi = res->get_ids (i);

            maxheap_heapify (k, simi, idxi);
            for (j = 0; j < ny; j++) {
                float disij = fvec_L2sqr (x_i, y_j, d);

                if (disij < simi[0]) {
                    maxheap_pop (k, simi, idxi);
                    maxheap_push (k, simi, idxi, disij, j);
                }
                y_j += d;
            }
            maxheap_reorder (k, simi, idxi);
        }
        InterruptCallback::check ();
    }

}


/** Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_minheap_array_t * res)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    std::unique_ptr<float[]> ip_block(new float[bs_x * bs_y]);

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block.get(), &nyi);
            }

            /* collect maxima */
            res->addn (j1 - j0, ip_block.get(), j0, i0, i1 - i0);
        }
        InterruptCallback::check ();
    }
    res->reorder ();
}

// distance correction is an operator that can be applied to transform
// the distances
template<class DistanceCorrection>
static void knn_L2sqr_blas (const float * x,
                            const float * y,
                            size_t d, size_t nx, size_t ny,
                            float_maxheap_array_t * res,
                            const DistanceCorrection &corr)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    size_t k = res->k;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];
    float *x_norms = new float[nx];
    float *y_norms = new float[ny];
    ScopeDeleter<float> del1(ip_block), del3(x_norms), del2(y_norms);

    fvec_norms_L2sqr (x_norms, x, d, nx);
    fvec_norms_L2sqr (y_norms, y, d, ny);


    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect minima */
#pragma omp parallel for
            for (size_t i = i0; i < i1; i++) {
                float * __restrict simi = res->get_val(i);
                long * __restrict idxi = res->get_ids (i);
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    float dis = x_norms[i] + y_norms[j] - 2 * ip;

                    // negative values can occur for identical vectors
                    // due to roundoff errors
                    if (dis < 0) dis = 0;

                    dis = corr (dis, i, j);

                    if (dis < simi[0]) {
                        maxheap_pop (k, simi, idxi);
                        maxheap_push (k, simi, idxi, dis, j);
                    }
                }
            }
        }
        InterruptCallback::check ();
    }
    res->reorder ();

}




/*******************************************************
 * KNN driver functions
 *******************************************************/

int distance_compute_blas_threshold = 20;

void knn_inner_product (const float * x,
                        const float * y,
                        size_t d, size_t nx, size_t ny,
                        float_minheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_inner_product_sse (x, y, d, nx, ny, res);
    } else {
        knn_inner_product_blas (x, y, d, nx, ny, res);
    }
}


struct NopDistanceCorrection {
    float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
        return dis;
    }
};

void knn_L2sqr (const float * x,
                const float * y,
                size_t d, size_t nx, size_t ny,
                float_maxheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_L2sqr_sse (x, y, d, nx, ny, res);
    } else {
        NopDistanceCorrection nop;
        knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
    }
}
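
/* Sketch (not part of Faiss): calling the L2 driver directly.  After the
   call, row i of distances/labels holds the k nearest database ids for
   query i in increasing order of squared L2 distance (the heaps are
   reordered before returning).  Same HeapArray field-order assumption as
   above; guarded out. */
#if 0
static void example_knn_L2sqr (const float *xq, const float *xb,
                               size_t d, size_t nq, size_t nb, size_t k,
                               float *distances, long *labels)
{
    float_maxheap_array_t res = {nq, k, labels, distances};
    knn_L2sqr (xq, xb, d, nq, nb, &res);
}
#endif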

struct BaseShiftDistanceCorrection {
    const float *base_shift;
    float operator()(float dis, size_t /*qno*/, size_t bno) const {
        return dis - base_shift[bno];
    }
};

void knn_L2sqr_base_shift (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res,
        const float *base_shift)
{
    BaseShiftDistanceCorrection corr = {base_shift};
    knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
}



/***************************************************************************
 * compute a subset of distances
 ***************************************************************************/

/* compute the inner product between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_inner_products_by_idx (float * __restrict ip,
                                 const float * x,
                                 const float * y,
                                 const long * __restrict ids, /* for y vecs */
                                 size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict ipj = ip + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
        }
    }
}

/* compute the squared L2 distances between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_L2sqr_by_idx (float * __restrict dis,
                        const float * x,
                        const float * y,
                        const long * __restrict ids, /* ids of y vecs */
                        size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict disj = dis + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
        }
    }
}




/* Find the nearest neighbors for nx queries in a set of ny vectors
   indexed by ids. May be useful for re-ranking a pre-selected vector list */
void knn_inner_products_by_idx (const float * x,
                                const float * y,
                                const long * ids,
                                size_t d, size_t nx, size_t ny,
                                float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * idsi = ids + i * ny;
        size_t j;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        minheap_heapify (k, simi, idxi);

        for (j = 0; j < ny; j++) {
            if (idsi[j] < 0) break;
            float ip = fvec_inner_product (x_, y + d * idsi[j], d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, idsi[j]);
            }
        }
        minheap_reorder (k, simi, idxi);
    }

}

void knn_L2sqr_by_idx (const float * x,
                       const float * y,
                       const long * __restrict ids,
                       size_t d, size_t nx, size_t ny,
                       float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * __restrict idsi = ids + i * ny;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        maxheap_heapify (k, simi, idxi);
        for (size_t j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, idsi[j]);
            }
        }
        maxheap_reorder (k, simi, idxi);
    }

}
/***************************************************************************
 * Range search
 ***************************************************************************/

/** Find the nearest neighbors for nx queries in a set of ny vectors
 * compute_l2 = compute pairwise squared L2 distance rather than inner prod
 */
template <bool compute_l2>
static void range_search_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *result)
{

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];
    ScopeDeleter<float> del0(ip_block);

    float *x_norms = nullptr, *y_norms = nullptr;
    ScopeDeleter<float> del1, del2;
    if (compute_l2) {
        x_norms = new float[nx];
        del1.set (x_norms);
        fvec_norms_L2sqr (x_norms, x, d, nx);

        y_norms = new float[ny];
        del2.set (y_norms);
        fvec_norms_L2sqr (y_norms, y, d, ny);
    }

    std::vector <RangeSearchPartialResult *> partial_results;

    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
        size_t j1 = j0 + bs_y;
        if (j1 > ny) j1 = ny;
        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
        partial_results.push_back (pres);

        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
            size_t i1 = i0 + bs_x;
            if (i1 > nx) i1 = nx;

            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }


            for (size_t i = i0; i < i1; i++) {
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                RangeQueryResult & qres = pres->new_result (i);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    if (compute_l2) {
                        float dis = x_norms[i] + y_norms[j] - 2 * ip;
                        if (dis < radius) {
                            qres.add (dis, j);
                        }
                    } else {
                        if (ip > radius) {
                            qres.add (ip, j);
                        }
                    }
                }
            }
        }
        InterruptCallback::check ();
    }

    RangeSearchPartialResult::merge (partial_results);
}


template <bool compute_l2>
static void range_search_sse (const float * x,
                              const float * y,
                              size_t d, size_t nx, size_t ny,
                              float radius,
                              RangeSearchResult *res)
{
    FAISS_THROW_IF_NOT (d % 4 == 0);

    size_t check_period = InterruptCallback::get_period_hint (ny * d);
    bool interrupted = false;

#pragma omp parallel
    {
        RangeSearchPartialResult pres (res);

        for (size_t i0 = 0; i0 < nx; i0 += check_period) {
            size_t i1 = std::min(i0 + check_period, nx);

#pragma omp for
            for (size_t i = i0; i < i1; i++) {
                const float * x_ = x + i * d;
                const float * y_ = y;
                size_t j;

                RangeQueryResult & qres = pres.new_result (i);

                for (j = 0; j < ny; j++) {
                    if (compute_l2) {
                        float disij = fvec_L2sqr (x_, y_, d);
                        if (disij < radius) {
                            qres.add (disij, j);
                        }
                    } else {
                        float ip = fvec_inner_product (x_, y_, d);
                        if (ip > radius) {
                            qres.add (ip, j);
                        }
                    }
                    y_ += d;
                }

            }

            if (InterruptCallback::is_interrupted ()) {
                interrupted = true;
            }

#pragma omp barrier
            if (interrupted) {
                break;
            }
        }
        pres.finalize ();
    }

    if (interrupted) {
        FAISS_THROW_MSG ("computation interrupted");
    }
}




void range_search_L2sqr (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<true> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<true> (x, y, d, nx, ny, radius, res);
    }
}

void range_search_inner_product (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<false> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<false> (x, y, d, nx, ny, radius, res);
    }
}
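
/* Sketch (not part of Faiss): consuming a RangeSearchResult.  Assuming the
   RangeSearchResult constructor allocates the lims array (as declared in
   AuxIndexStructures.h), the neighbors of query i end up in
   res.labels / res.distances between offsets res.lims[i] and
   res.lims[i + 1].  Guarded out. */
#if 0
static size_t example_count_in_radius (const float *xq, const float *xb,
                                       size_t d, size_t nq, size_t nb,
                                       float radius)
{
    RangeSearchResult res (nq);
    range_search_L2sqr (xq, xb, d, nq, nb, radius, &res);
    return res.lims[nq];   // total number of (query, neighbor) pairs
}
#endif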



/***************************************************************************
 * Some matrix manipulation functions
 ***************************************************************************/


/* This function exists because the Torch counterpart is extremely slow
   (not multi-threaded + unexpected overhead even in single thread).
   It is here to implement the usual property
   |x-y|^2 = |x|^2 + |y|^2 - 2<x|y> */
void inner_product_to_L2sqr (float * __restrict dis,
                             const float * nr1,
                             const float * nr2,
                             size_t n1, size_t n2)
{

#pragma omp parallel for
    for (size_t j = 0; j < n1; j++) {
        float * disj = dis + j * n2;
        for (size_t i = 0; i < n2; i++)
            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
    }
}

void matrix_qr (int m, int n, float *a)
{
    FAISS_THROW_IF_NOT (m >= n);
    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
    std::vector<float> tau (ki);
    FINTEGER lwork = -1, info;
    float work_size;

    // workspace size query, then the actual QR factorization
    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
             &work_size, &lwork, &info);
    lwork = size_t(work_size);
    std::vector<float> work (lwork);

    sgeqrf_ (&mi, &ni, a, &mi,
             tau.data(), work.data(), &lwork, &info);

    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
             work.data(), &lwork, &info);

}
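
/* Sketch (not part of Faiss's public API): drawing a random orthogonal
   matrix by orthonormalizing an iid Gaussian matrix, the usual combination
   of float_randn and matrix_qr.  Guarded out. */
#if 0
static std::vector<float> example_random_orthogonal (int d)
{
    std::vector<float> A (d * d);
    float_randn (A.data (), A.size (), 1234);   // iid N(0,1) entries
    matrix_qr (d, d, A.data ());                // keep only the Q factor
    return A;
}
#endif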


void pairwise_L2sqr (long d,
                     long nq, const float *xq,
                     long nb, const float *xb,
                     float *dis,
                     long ldq, long ldb, long ldd)
{
    if (nq == 0 || nb == 0) return;
    if (ldq == -1) ldq = d;
    if (ldb == -1) ldb = d;
    if (ldd == -1) ldd = nb;

    // store in beginning of distance matrix to avoid malloc
    float *b_norms = dis;

#pragma omp parallel for
    for (long i = 0; i < nb; i++)
        b_norms[i] = fvec_norm_L2sqr (xb + i * ldb, d);

    // row 0 still holds b_norms, so it is handled separately below
#pragma omp parallel for
    for (long i = 1; i < nq; i++) {
        float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
        for (long j = 0; j < nb; j++)
            dis[i * ldd + j] = q_norm + b_norms[j];
    }

    {
        float q_norm = fvec_norm_L2sqr (xq, d);
        for (long j = 0; j < nb; j++)
            dis[j] += q_norm;
    }

    {
        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
        float one = 1.0, minus_2 = -2.0;

        sgemm_ ("Transposed", "Not transposed",
                &nbi, &nqi, &di,
                &minus_2,
                xb, &ldbi,
                xq, &ldqi,
                &one, dis, &lddi);
    }


}
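
/* Sketch (not part of Faiss): a full nq x nb matrix of squared L2
   distances with contiguous storage, obtained by passing -1 for the three
   leading dimensions (ldq = ldb = d and ldd = nb).  Guarded out. */
#if 0
static void example_pairwise_L2sqr (const float *xq, const float *xb,
                                    long d, long nq, long nb,
                                    float *dis /* nq * nb floats */)
{
    pairwise_L2sqr (d, nq, xq, nb, xb, dis, -1, -1, -1);
}
#endif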


/***************************************************************************
 * Kmeans subroutine
 ***************************************************************************/

// a bit above machine epsilon for float16

#define EPS (1 / 1024.)

/* For k-means, compute centroids given assignment of vectors to centroids */
int km_update_centroids (const float * x,
                         float * centroids,
                         long * assign,
                         size_t d, size_t k, size_t n,
                         size_t k_frozen)
{
    k -= k_frozen;
    centroids += k_frozen * d;

    std::vector<size_t> hassign(k);
    memset (centroids, 0, sizeof(*centroids) * d * k);

#pragma omp parallel
    {
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();
        // this thread is taking care of centroids c0:c1
        size_t c0 = (k * rank) / nt;
        size_t c1 = (k * (rank + 1)) / nt;
        const float *xi = x;
        size_t nacc = 0;

        for (size_t i = 0; i < n; i++) {
            long ci = assign[i];
            assert (ci >= 0 && ci < k + k_frozen);
            ci -= k_frozen;
            if (ci >= c0 && ci < c1) {
                float * c = centroids + ci * d;
                hassign[ci]++;
                for (size_t j = 0; j < d; j++)
                    c[j] += xi[j];
                nacc++;
            }
            xi += d;
        }

    }

#pragma omp parallel for
    for (size_t ci = 0; ci < k; ci++) {
        float * c = centroids + ci * d;
        float ni = (float) hassign[ci];
        if (ni != 0) {
            for (size_t j = 0; j < d; j++)
                c[j] /= ni;
        }
    }

    /* Take care of void clusters */
    size_t nsplit = 0;
    RandomGenerator rng (1234);
    for (size_t ci = 0; ci < k; ci++) {
        if (hassign[ci] == 0) { /* need to redefine a centroid */
            size_t cj;
            for (cj = 0; 1; cj = (cj + 1) % k) {
                /* probability to pick this cluster for split */
                float p = (hassign[cj] - 1.0) / (float) (n - k);
                float r = rng.rand_float ();
                if (r < p) {
                    break; /* found our cluster to be split */
                }
            }
            memcpy (centroids + ci * d, centroids + cj * d,
                    sizeof(*centroids) * d);

            /* small symmetric perturbation */
            for (size_t j = 0; j < d; j++) {
                if (j % 2 == 0) {
                    centroids[ci * d + j] *= 1 + EPS;
                    centroids[cj * d + j] *= 1 - EPS;
                } else {
                    centroids[ci * d + j] *= 1 - EPS;
                    centroids[cj * d + j] *= 1 + EPS;
                }
            }

            /* assume even split of the cluster */
            hassign[ci] = hassign[cj] / 2;
            hassign[cj] -= hassign[ci];
            nsplit++;
        }
    }

    return nsplit;
}
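
/* Sketch (not part of Faiss): one Lloyd iteration built from the
   primitives in this file: assign every point to its nearest centroid with
   knn_L2sqr (k = 1), then recompute the centroids.  The real driver
   (Clustering.cpp) also tracks the objective and handles training-set
   subsampling; guarded out. */
#if 0
static void example_kmeans_iteration (const float *x, float *centroids,
                                      size_t d, size_t k, size_t n)
{
    std::vector<long> assign (n);
    std::vector<float> dis (n);
    float_maxheap_array_t res = {n, 1, assign.data (), dis.data ()};
    knn_L2sqr (x, centroids, d, n, k, &res);
    km_update_centroids (x, centroids, assign.data (), d, k, n, 0);
}
#endif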

#undef EPS



/***************************************************************************
 * Result list routines
 ***************************************************************************/


void ranklist_handle_ties (int k, long *idx, const float *dis)
{
    float prev_dis = -1e38;
    int prev_i = -1;
    for (int i = 0; i < k; i++) {
        if (dis[i] != prev_dis) {
            if (i > prev_i + 1) {
                // sort between prev_i and i - 1
                std::sort (idx + prev_i, idx + i);
            }
            prev_i = i;
            prev_dis = dis[i];
        }
    }
    // also sort a tie run that extends to the end of the list
    if (k > prev_i + 1) {
        std::sort (idx + prev_i, idx + k);
    }
}
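
/* Worked example (illustrative): with k = 5, dis = {0.1, 0.2, 0.2, 0.2, 0.4}
   and idx = {9, 7, 3, 5, 1}, the tie run at distance 0.2 is sorted in place
   and idx becomes {9, 3, 5, 7, 1}.  This makes result lists comparable
   across runs that break ties differently. */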

size_t merge_result_table_with (size_t n, size_t k,
                                long *I0, float *D0,
                                const long *I1, const float *D1,
                                bool keep_min,
                                long translation)
{
    size_t n1 = 0;

#pragma omp parallel reduction(+:n1)
    {
        std::vector<long> tmpI (k);
        std::vector<float> tmpD (k);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long *lI0 = I0 + i * k;
            float *lD0 = D0 + i * k;
            const long *lI1 = I1 + i * k;
            const float *lD1 = D1 + i * k;
            size_t r0 = 0;
            size_t r1 = 0;

            if (keep_min) {
                for (size_t j = 0; j < k; j++) {

                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            } else {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            }
            n1 += r1;
            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
        }
    }

    return n1;
}



size_t ranklist_intersection_size (size_t k1, const long *v1,
                                   size_t k2, const long *v2_in)
{
    if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
    long *v2 = new long [k2];
    memcpy (v2, v2_in, sizeof (long) * k2);
    std::sort (v2, v2 + k2);
    { // de-dup v2
        long prev = -1;
        size_t wp = 0;
        for (size_t i = 0; i < k2; i++) {
            if (v2[i] != prev) {
                v2[wp++] = prev = v2[i];
            }
        }
        k2 = wp;
    }
    // entries of v2 that were already matched get this bit set, so
    // duplicates in v1 are counted only once
    const long seen_flag = 1L << 60;
    size_t count = 0;
    for (size_t i = 0; i < k1; i++) {
        long q = v1[i];
        // binary search of q in v2, ignoring the seen_flag bit
        size_t i0 = 0, i1 = k2;
        while (i0 + 1 < i1) {
            size_t imed = (i1 + i0) / 2;
            long piv = v2[imed] & ~seen_flag;
            if (piv <= q) i0 = imed;
            else i1 = imed;
        }
        if (v2[i0] == q) {
            count++;
            v2[i0] |= seen_flag;
        }
    }
    delete [] v2;

    return count;
}

double imbalance_factor (int k, const int *hist) {
    double tot = 0, uf = 0;

    for (int i = 0; i < k; i++) {
        tot += hist[i];
        uf += hist[i] * (double) hist[i];
    }
    uf = uf * k / (tot * tot);

    return uf;
}
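
/* Worked example: for k = 2, hist = {50, 50} gives
   uf = (2500 + 2500) * 2 / 100^2 = 1.0 (perfectly balanced), while
   hist = {100, 0} gives 10000 * 2 / 10000 = 2.0.  The factor is 1 for a
   balanced assignment and grows up to k as everything concentrates in one
   cell. */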


double imbalance_factor (int n, int k, const long *assign) {
    std::vector<int> hist(k, 0);
    for (int i = 0; i < n; i++) {
        hist[assign[i]]++;
    }

    return imbalance_factor (k, hist.data());
}



int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
    memset (hist, 0, sizeof(hist[0]) * vmax);
    int nout = 0;
    while (n--) {
        if (v[n] < 0 || v[n] >= vmax) nout++;
        else hist[v[n]]++;
    }
    return nout;
}


void bincode_hist (size_t n, size_t nbits, const uint8_t *codes, int *hist)
{
    FAISS_THROW_IF_NOT (nbits % 8 == 0);
    size_t d = nbits / 8;
    std::vector<int> accu(d * 256);
    const uint8_t *c = codes;
    for (size_t i = 0; i < n; i++)
        for (int j = 0; j < d; j++)
            accu[j * 256 + *c++]++;
    memset (hist, 0, sizeof(*hist) * nbits);
    for (int i = 0; i < d; i++) {
        const int *ai = accu.data() + i * 256;
        int * hi = hist + i * 8;
        for (int j = 0; j < 256; j++)
            for (int k = 0; k < 8; k++)
                if ((j >> k) & 1)
                    hi[k] += ai[j];
    }

}



size_t ivec_checksum (size_t n, const int *a)
{
    size_t cs = 112909;
    while (n--) cs = cs * 65713 + a[n] * 1686049;
    return cs;
}


namespace {
    struct ArgsortComparator {
        const float *vals;
        bool operator() (const size_t a, const size_t b) const {
            return vals[a] < vals[b];
        }
    };

    struct SegmentS {
        size_t i0; // begin pointer in the permutation array
        size_t i1; // end
        size_t len() const {
            return i1 - i0;
        }
    };

    // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
    // extended to > 1 merge thread

    // merges 2 ranges that should be consecutive on the source into
    // the union of the two on the destination
    template<typename T>
    void parallel_merge (const T *src, T *dst,
                         SegmentS &s1, SegmentS &s2, int nt,
                         const ArgsortComparator &comp) {
        if (s2.len() > s1.len()) { // make sure that s1 is larger than s2
            std::swap(s1, s2);
        }

        // compute sub-ranges for each thread
        SegmentS s1s[nt], s2s[nt], sws[nt];
        s2s[0].i0 = s2.i0;
        s2s[nt - 1].i1 = s2.i1;

        // not sure parallel actually helps here
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            s1s[t].i0 = s1.i0 + s1.len() * t / nt;
            s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;

            if (t + 1 < nt) {
                T pivot = src[s1s[t].i1];
                size_t i0 = s2.i0, i1 = s2.i1;
                while (i0 + 1 < i1) {
                    size_t imed = (i1 + i0) / 2;
                    if (comp (pivot, src[imed])) { i1 = imed; }
                    else                         { i0 = imed; }
                }
                s2s[t].i1 = s2s[t + 1].i0 = i1;
            }
        }
        s1.i0 = std::min(s1.i0, s2.i0);
        s1.i1 = std::max(s1.i1, s2.i1);
        s2 = s1;
        sws[0].i0 = s1.i0;
        for (int t = 0; t < nt; t++) {
            sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
            if (t + 1 < nt) {
                sws[t + 1].i0 = sws[t].i1;
            }
        }
        assert(sws[nt - 1].i1 == s1.i1);

        // do the actual merging
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            SegmentS sw = sws[t];
            SegmentS s1t = s1s[t];
            SegmentS s2t = s2s[t];
            if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
                for (;;) {
                    // assert (sw.len() == s1t.len() + s2t.len());
                    if (comp(src[s1t.i0], src[s2t.i0])) {
                        dst[sw.i0++] = src[s1t.i0++];
                        if (s1t.i0 == s1t.i1) break;
                    } else {
                        dst[sw.i0++] = src[s2t.i0++];
                        if (s2t.i0 == s2t.i1) break;
                    }
                }
            }
            if (s1t.len() > 0) {
                assert(s1t.len() == sw.len());
                memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
            } else if (s2t.len() > 0) {
                assert(s2t.len() == sw.len());
                memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
            }
        }
    }

} // namespace

void fvec_argsort (size_t n, const float *vals,
                   size_t *perm)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;
    ArgsortComparator comp = {vals};
    std::sort (perm, perm + n, comp);
}

void fvec_argsort_parallel (size_t n, const float *vals,
                            size_t *perm)
{
    size_t * perm2 = new size_t[n];
    // 2 result tables, during merging, flip between them
    size_t *permB = perm2, *permA = perm;

    int nt = omp_get_max_threads();
    { // prepare correct permutation so that the result ends in perm
      // at final iteration
        int nseg = nt;
        while (nseg > 1) {
            nseg = (nseg + 1) / 2;
            std::swap (permA, permB);
        }
    }

#pragma omp parallel for
    for (size_t i = 0; i < n; i++) permA[i] = i;

    ArgsortComparator comp = {vals};

    SegmentS segs[nt];

    // independent sorts
#pragma omp parallel for
    for (int t = 0; t < nt; t++) {
        size_t i0 = t * n / nt;
        size_t i1 = (t + 1) * n / nt;
        SegmentS seg = {i0, i1};
        std::sort (permA + seg.i0, permA + seg.i1, comp);
        segs[t] = seg;
    }
    int prev_nested = omp_get_nested();
    omp_set_nested(1);

    int nseg = nt;
    while (nseg > 1) {
        int nseg1 = (nseg + 1) / 2;
        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
        int sub_nseg1 = nseg / 2;

#pragma omp parallel for num_threads(nseg1)
        for (int s = 0; s < nseg; s += 2) {
            if (s + 1 == nseg) { // otherwise isolated segment
                memcpy(permB + segs[s].i0, permA + segs[s].i0,
                       segs[s].len() * sizeof(size_t));
            } else {
                int t0 = s * sub_nt / sub_nseg1;
                int t1 = (s + 1) * sub_nt / sub_nseg1;
                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                parallel_merge(permA, permB, segs[s], segs[s + 1],
                               t1 - t0, comp);
            }
        }
        for (int s = 0; s < nseg; s += 2)
            segs[s / 2] = segs[s];
        nseg = nseg1;
        std::swap (permA, permB);
    }
    assert (permA == perm);
    omp_set_nested(prev_nested);
    delete [] perm2;
}




const float *fvecs_maybe_subsample (
          size_t d, size_t *n, size_t nmax, const float *x,
          bool verbose, long seed)
{

    if (*n <= nmax) return x; // nothing to do

    size_t n2 = nmax;
    if (verbose) {
        printf ("  Input training set too big (max size is %zu), sampling "
                "%zu / %zu vectors\n", nmax, n2, *n);
    }
    std::vector<int> subset (*n);
    rand_perm (subset.data (), *n, seed);
    float *x_subset = new float[n2 * d];
    for (size_t i = 0; i < n2; i++)
        memcpy (&x_subset[i * d],
                &x[subset[i] * size_t(d)],
                sizeof (x[0]) * d);
    *n = n2;
    return x_subset;
}


void binary_to_real (size_t d, const uint8_t *x_in, float *x_out) {
    for (size_t i = 0; i < d; ++i) {
        x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
    }
}

void real_to_binary (size_t d, const float *x_in, uint8_t *x_out) {
    for (size_t i = 0; i < d / 8; ++i) {
        uint8_t b = 0;
        for (int j = 0; j < 8; ++j) {
            if (x_in[8 * i + j] > 0) {
                b |= (1 << j);
            }
        }
        x_out[i] = b;
    }
}
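
/* Sketch (not part of Faiss): binary_to_real maps bit b to 2b - 1 in
   {-1, +1} and real_to_binary thresholds at 0, so for d a multiple of 8
   the two functions round-trip exactly.  Guarded out. */
#if 0
static void example_binary_roundtrip ()
{
    const size_t d = 64;
    uint8_t in[d / 8], out[d / 8];
    byte_rand (in, d / 8, 42);
    std::vector<float> f (d);
    binary_to_real (d, in, f.data ());
    real_to_binary (d, f.data (), out);
    assert (memcmp (in, out, d / 8) == 0);
}
#endif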


// from Python's stringobject.c; assumes n > 0 (the first byte is read
// before the length is checked)
uint64_t hash_bytes (const uint8_t *bytes, long n) {
    const uint8_t *p = bytes;
    uint64_t x = (uint64_t)(*p) << 7;
    long len = n;
    while (--len >= 0) {
        x = (1000003 * x) ^ *p++;
    }
    x ^= n;
    return x;
}


bool check_openmp () {
    omp_set_num_threads(10);

    if (omp_get_max_threads() != 10) {
        return false;
    }

    std::vector<int> nt_per_thread(10);
    size_t sum = 0;
    bool in_parallel = true;
#pragma omp parallel reduction(+: sum)
    {
        if (!omp_in_parallel()) {
            in_parallel = false;
        }

        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        nt_per_thread[rank] = nt;
#pragma omp for
        for (int i = 0; i < 1000 * 1000 * 10; i++) {
            sum += i;
        }
    }

    if (!in_parallel) {
        return false;
    }
    if (nt_per_thread[0] != 10) {
        return false;
    }
    if (sum == 0) {
        return false;
    }

    return true;
}

} // namespace faiss