Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
12 #include "utils.h"
13 
14 #include <cstdio>
15 #include <cassert>
16 #include <cstring>
17 #include <cmath>
18 
19 #include <immintrin.h>
20 
21 
22 #include <sys/time.h>
23 #include <sys/types.h>
24 #include <unistd.h>
25 
26 #include <omp.h>
27 
28 
29 #include <algorithm>
30 #include <vector>
31 
32 #include "AuxIndexStructures.h"
33 #include "FaissAssert.h"
34 
35 
36 
37 #ifndef FINTEGER
38 #define FINTEGER long
39 #endif
40 
41 
42 extern "C" {
43 
44 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
45 
46 int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
47  n, FINTEGER *k, const float *alpha, const float *a,
48  FINTEGER *lda, const float *b, FINTEGER *
49  ldb, float *beta, float *c, FINTEGER *ldc);
50 
51 /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
52 
53 int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
54  float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
55 
56 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
57  FINTEGER *lda, float *tau, float *work,
58  FINTEGER *lwork, FINTEGER *info);
59 
60 
61 }
62 
63 
64 /**************************************************
65  * Get some stats about the system
66  **************************************************/
67 
68 namespace faiss {
69 
70 #ifdef __AVX__
71 #define USE_AVX
72 #endif
73 
74 double getmillisecs () {
75  struct timeval tv;
76  gettimeofday (&tv, nullptr);
77  return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
78 }
79 
80 
81 #ifdef __linux__
82 
83 size_t get_mem_usage_kb ()
84 {
85  int pid = getpid ();
86  char fname[256];
87  snprintf (fname, 256, "/proc/%d/status", pid);
88  FILE * f = fopen (fname, "r");
89  FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
90  size_t sz = 0;
91  for (;;) {
92  char buf [256];
93  if (!fgets (buf, 256, f)) break;
94  if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
95  }
96  fclose (f);
97  return sz;
98 }
99 
100 #elif __APPLE__
101 
102 size_t get_mem_usage_kb ()
103 {
104  fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
105  return 0;
106 }
107 
108 #endif
109 
110 
111 
112 /**************************************************
113  * Random data generation functions
114  **************************************************/
115 
116 /**
117  * The definition of random functions depends on the architecture:
118  *
119  * - for Linux, we rely on re-entrant functions (random_r). This
120  * provides good quality reproducible random sequences.
121  *
122  * - for Apple, we use rand_r. Apple is trying so hard to deprecate
123  * this function that it removed its definition form stdlib.h, so we
124  * re-declare it below. Fortunately, since it is deprecated, its
125  * prototype should not change much in the forerseeable future.
126  *
127  * Unfortunately, system designers are more concerned with making the
128  * most unpredictable random sequences for cryptographic use, when in
129  * scientific contexts what acutally matters is having reproducible
130  * squences in multi-threaded contexts.
131  */
132 
133 
134 #ifdef __linux__
135 
136 
137 
138 
140 {
141  int32_t a;
142  random_r (&rand_data, &a);
143  return a;
144 }
145 
147 {
148  int32_t a, b;
149  random_r (&rand_data, &a);
150  random_r (&rand_data, &b);
151  return long(a) | long(b) << 31;
152 }
153 
154 
156 {
157  memset (&rand_data, 0, sizeof (rand_data));
158  initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
159 }
160 
161 
162 RandomGenerator::RandomGenerator (const RandomGenerator & other)
163 {
164  memcpy (rand_state, other.rand_state, sizeof(rand_state));
165  rand_data = other.rand_data;
166  setstate_r (rand_state, &rand_data);
167 }
168 
169 
170 #elif __APPLE__
171 
172 extern "C" {
173 int rand_r(unsigned *seed);
174 }
175 
177 {
178  rand_state = seed;
179 }
180 
181 
182 RandomGenerator::RandomGenerator (const RandomGenerator & other)
183 {
184  rand_state = other.rand_state;
185 }
186 
187 
189 {
190  // RAND_MAX is 31 bits
191  // try to add more randomness in the lower bits
192  int lowbits = rand_r(&rand_state) >> 15;
193  return rand_r(&rand_state) ^ lowbits;
194 }
195 
197 {
198  return long(random()) | long(random()) << 31;
199 }
200 
201 
202 
203 #endif
204 
206 { // this suffers form non-uniform probabilities when max is not a
207  // power of 2, but if RAND_MAX >> max the bias is limited.
208  return rand_int () % max;
209 }
210 
212 {
213  return rand_int() / float(1L << 31);
214 }
215 
216 double RandomGenerator::rand_double ()
217 {
218  return rand_long() / double(1L << 62);
219 }
220 
221 
222 /***********************************************************************
223  * Random functions in this C file only exist because Torch
224  * counterparts are slow and not multi-threaded. Typical use is for
225  * more than 1-100 billion values. */
226 
227 
228 /* Generate a set of random floating point values such that x[i] in [0,1]
229  multi-threading. For this reason, we rely on re-entreant functions. */
230 void float_rand (float * x, size_t n, long seed)
231 {
232  // only try to parallelize on large enough arrays
233  const size_t nblock = n < 1024 ? 1 : 1024;
234 
235  RandomGenerator rng0 (seed);
236  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
237 
238 #pragma omp parallel for
239  for (size_t j = 0; j < nblock; j++) {
240 
241  RandomGenerator rng (a0 + j * b0);
242 
243  const size_t istart = j * n / nblock;
244  const size_t iend = (j + 1) * n / nblock;
245 
246  for (size_t i = istart; i < iend; i++)
247  x[i] = rng.rand_float ();
248  }
249 }
250 
251 
252 void float_randn (float * x, size_t n, long seed)
253 {
254  // only try to parallelize on large enough arrays
255  const size_t nblock = n < 1024 ? 1 : 1024;
256 
257  RandomGenerator rng0 (seed);
258  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
259 
260 #pragma omp parallel for
261  for (size_t j = 0; j < nblock; j++) {
262  RandomGenerator rng (a0 + j * b0);
263 
264  double a = 0, b = 0, s = 0;
265  int state = 0; /* generate two number per "do-while" loop */
266 
267  const size_t istart = j * n / nblock;
268  const size_t iend = (j + 1) * n / nblock;
269 
270  for (size_t i = istart; i < iend; i++) {
271  /* Marsaglia's method (see Knuth) */
272  if (state == 0) {
273  do {
274  a = 2.0 * rng.rand_double () - 1;
275  b = 2.0 * rng.rand_double () - 1;
276  s = a * a + b * b;
277  } while (s >= 1.0);
278  x[i] = a * sqrt(-2.0 * log(s) / s);
279  }
280  else
281  x[i] = b * sqrt(-2.0 * log(s) / s);
282  state = 1 - state;
283  }
284  }
285 }
286 
287 
288 /* Integer versions */
289 void long_rand (long * x, size_t n, long seed)
290 {
291  // only try to parallelize on large enough arrays
292  const size_t nblock = n < 1024 ? 1 : 1024;
293 
294  RandomGenerator rng0 (seed);
295  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
296 
297 #pragma omp parallel for
298  for (size_t j = 0; j < nblock; j++) {
299 
300  RandomGenerator rng (a0 + j * b0);
301 
302  const size_t istart = j * n / nblock;
303  const size_t iend = (j + 1) * n / nblock;
304  for (size_t i = istart; i < iend; i++)
305  x[i] = rng.rand_long ();
306  }
307 }
308 
309 
310 
311 void rand_perm (int *perm, size_t n, long seed)
312 {
313  for (size_t i = 0; i < n; i++) perm[i] = i;
314 
315  RandomGenerator rng (seed);
316 
317  for (size_t i = 0; i + 1 < n; i++) {
318  int i2 = i + rng.rand_int (n - i);
319  std::swap(perm[i], perm[i2]);
320  }
321 }
322 
323 
324 
325 
326 void byte_rand (uint8_t * x, size_t n, long seed)
327 {
328  // only try to parallelize on large enough arrays
329  const size_t nblock = n < 1024 ? 1 : 1024;
330 
331  RandomGenerator rng0 (seed);
332  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
333 
334 #pragma omp parallel for
335  for (size_t j = 0; j < nblock; j++) {
336 
337  RandomGenerator rng (a0 + j * b0);
338 
339  const size_t istart = j * n / nblock;
340  const size_t iend = (j + 1) * n / nblock;
341 
342  size_t i;
343  for (i = istart; i < iend; i++)
344  x[i] = rng.rand_long ();
345  }
346 }
347 
348 
349 
350 void reflection (const float * __restrict u,
351  float * __restrict x,
352  size_t n, size_t d, size_t nu)
353 {
354  size_t i, j, l;
355  for (i = 0; i < n; i++) {
356  const float * up = u;
357  for (l = 0; l < nu; l++) {
358  float ip1 = 0, ip2 = 0;
359 
360  for (j = 0; j < d; j+=2) {
361  ip1 += up[j] * x[j];
362  ip2 += up[j+1] * x[j+1];
363  }
364  float ip = 2 * (ip1 + ip2);
365 
366  for (j = 0; j < d; j++)
367  x[j] -= ip * up[j];
368  up += d;
369  }
370  x += d;
371  }
372 }
373 
374 
375 /* Reference implementation (slower) */
376 void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
377 {
378  size_t i, j, l;
379  for (i = 0; i < n; i++) {
380  const float * up = u;
381  for (l = 0; l < nu; l++) {
382  double ip = 0;
383 
384  for (j = 0; j < d; j++)
385  ip += up[j] * x[j];
386  ip *= 2;
387 
388  for (j = 0; j < d; j++)
389  x[j] -= ip * up[j];
390 
391  up += d;
392  }
393  x += d;
394  }
395 }
396 
397 /*********************************************************
398  * Optimized distance computations
399  *********************************************************/
400 
401 
402 
403 /* Functions to compute:
404  - L2 distance between 2 vectors
405  - inner product between 2 vectors
406  - L2 norm of a vector
407 
408  The functions should probably not be invoked when a large number of
409  vectors are be processed in batch (in which case Matrix multiply
410  is faster), but may be useful for comparing vectors isolated in
411  memory.
412 
413  Works with any vectors of any dimension, even unaligned (in which
414  case they are slower).
415 
416 */
417 
418 
419 /*********************************************************
420  * Reference implementations
421  */
422 
423 
424 
425 /* same without SSE */
426 float fvec_L2sqr_ref (const float * x,
427  const float * y,
428  size_t d)
429 {
430  size_t i;
431  float res_ = 0;
432  for (i = 0; i < d; i++) {
433  const float tmp = x[i] - y[i];
434  res_ += tmp * tmp;
435  }
436  return res_;
437 }
438 
439 float fvec_inner_product_ref (const float * x,
440  const float * y,
441  size_t d)
442 {
443  size_t i;
444  float res_ = 0;
445  for (i = 0; i < d; i++)
446  res_ += x[i] * y[i];
447  return res_;
448 }
449 
450 float fvec_norm_L2sqr_ref (const float * __restrict x,
451  size_t d)
452 {
453  size_t i;
454  double res_ = 0;
455  for (i = 0; i < d; i++)
456  res_ += x[i] * x[i];
457  return res_;
458 }
459 
460 
461 /*********************************************************
462  * SSE and AVX implementations
463  */
464 
465 // reads 0 <= d < 4 floats as __m128
466 static inline __m128 masked_read (int d, const float *x)
467 {
468  assert (0 <= d && d < 4);
469  __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
470  switch (d) {
471  case 3:
472  buf[2] = x[2];
473  case 2:
474  buf[1] = x[1];
475  case 1:
476  buf[0] = x[0];
477  }
478  return _mm_load_ps (buf);
479  // cannot use AVX2 _mm_mask_set1_epi32
480 }
481 
482 #ifdef USE_AVX
483 
484 // reads 0 <= d < 8 floats as __m256
485 static inline __m256 masked_read_8 (int d, const float *x)
486 {
487  assert (0 <= d && d < 8);
488  if (d < 4) {
489  __m256 res = _mm256_setzero_ps ();
490  res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
491  return res;
492  } else {
493  __m256 res = _mm256_setzero_ps ();
494  res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
495  res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
496  return res;
497  }
498 }
499 
500 float fvec_inner_product (const float * x,
501  const float * y,
502  size_t d)
503 {
504  __m256 msum1 = _mm256_setzero_ps();
505 
506  while (d >= 8) {
507  __m256 mx = _mm256_loadu_ps (x); x += 8;
508  __m256 my = _mm256_loadu_ps (y); y += 8;
509  msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
510  d -= 8;
511  }
512 
513  __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
514  msum2 += _mm256_extractf128_ps(msum1, 0);
515 
516  if (d >= 4) {
517  __m128 mx = _mm_loadu_ps (x); x += 4;
518  __m128 my = _mm_loadu_ps (y); y += 4;
519  msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
520  d -= 4;
521  }
522 
523  if (d > 0) {
524  __m128 mx = masked_read (d, x);
525  __m128 my = masked_read (d, y);
526  msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
527  }
528 
529  msum2 = _mm_hadd_ps (msum2, msum2);
530  msum2 = _mm_hadd_ps (msum2, msum2);
531  return _mm_cvtss_f32 (msum2);
532 }
533 
534 float fvec_L2sqr (const float * x,
535  const float * y,
536  size_t d)
537 {
538  __m256 msum1 = _mm256_setzero_ps();
539 
540  while (d >= 8) {
541  __m256 mx = _mm256_loadu_ps (x); x += 8;
542  __m256 my = _mm256_loadu_ps (y); y += 8;
543  const __m256 a_m_b1 = mx - my;
544  msum1 += a_m_b1 * a_m_b1;
545  d -= 8;
546  }
547 
548  __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
549  msum2 += _mm256_extractf128_ps(msum1, 0);
550 
551  if (d >= 4) {
552  __m128 mx = _mm_loadu_ps (x); x += 4;
553  __m128 my = _mm_loadu_ps (y); y += 4;
554  const __m128 a_m_b1 = mx - my;
555  msum2 += a_m_b1 * a_m_b1;
556  d -= 4;
557  }
558 
559  if (d > 0) {
560  __m128 mx = masked_read (d, x);
561  __m128 my = masked_read (d, y);
562  __m128 a_m_b1 = mx - my;
563  msum2 += a_m_b1 * a_m_b1;
564  }
565 
566  msum2 = _mm_hadd_ps (msum2, msum2);
567  msum2 = _mm_hadd_ps (msum2, msum2);
568  return _mm_cvtss_f32 (msum2);
569 }
570 
571 #else
572 
573 /* SSE-implementation of L2 distance */
574 float fvec_L2sqr (const float * x,
575  const float * y,
576  size_t d)
577 {
578  __m128 msum1 = _mm_setzero_ps();
579 
580  while (d >= 4) {
581  __m128 mx = _mm_loadu_ps (x); x += 4;
582  __m128 my = _mm_loadu_ps (y); y += 4;
583  const __m128 a_m_b1 = mx - my;
584  msum1 += a_m_b1 * a_m_b1;
585  d -= 4;
586  }
587 
588  if (d > 0) {
589  // add the last 1, 2 or 3 values
590  __m128 mx = masked_read (d, x);
591  __m128 my = masked_read (d, y);
592  __m128 a_m_b1 = mx - my;
593  msum1 += a_m_b1 * a_m_b1;
594  }
595 
596  msum1 = _mm_hadd_ps (msum1, msum1);
597  msum1 = _mm_hadd_ps (msum1, msum1);
598  return _mm_cvtss_f32 (msum1);
599 }
600 
601 
602 float fvec_inner_product (const float * x,
603  const float * y,
604  size_t d)
605 {
606  __m128 mx, my;
607  __m128 msum1 = _mm_setzero_ps();
608 
609  while (d >= 4) {
610  mx = _mm_loadu_ps (x); x += 4;
611  my = _mm_loadu_ps (y); y += 4;
612  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
613  d -= 4;
614  }
615 
616  // add the last 1, 2, or 3 values
617  mx = masked_read (d, x);
618  my = masked_read (d, y);
619  __m128 prod = _mm_mul_ps (mx, my);
620 
621  msum1 = _mm_add_ps (msum1, prod);
622 
623  msum1 = _mm_hadd_ps (msum1, msum1);
624  msum1 = _mm_hadd_ps (msum1, msum1);
625  return _mm_cvtss_f32 (msum1);
626 }
627 
628 
629 
630 #endif
631 
632 float fvec_norm_L2sqr (const float * x,
633  size_t d)
634 {
635  __m128 mx;
636  __m128 msum1 = _mm_setzero_ps();
637 
638  while (d >= 4) {
639  mx = _mm_loadu_ps (x); x += 4;
640  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
641  d -= 4;
642  }
643 
644  mx = masked_read (d, x);
645  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
646 
647  msum1 = _mm_hadd_ps (msum1, msum1);
648  msum1 = _mm_hadd_ps (msum1, msum1);
649  return _mm_cvtss_f32 (msum1);
650 }
651 
652 
653 
654 
655 /***************************************************************************
656  * Matrix/vector ops
657  ***************************************************************************/
658 
659 
660 
661 /* Compute the inner product between a vector x and
662  a set of ny vectors y.
663  These functions are not intended to replace BLAS matrix-matrix, as they
664  would be significantly less efficient in this case. */
665 void fvec_inner_products_ny (float * __restrict ip,
666  const float * x,
667  const float * y,
668  size_t d, size_t ny)
669 {
670  for (size_t i = 0; i < ny; i++) {
671  ip[i] = fvec_inner_product (x, y, d);
672  y += d;
673  }
674 }
675 
676 
677 
678 
679 /* compute ny L2 distances between x and a set of vectors y */
680 void fvec_L2sqr_ny (float * __restrict dis,
681  const float * x,
682  const float * y,
683  size_t d, size_t ny)
684 {
685  for (size_t i = 0; i < ny; i++) {
686  dis[i] = fvec_L2sqr (x, y, d);
687  y += d;
688  }
689 }
690 
691 
692 
693 
694 /* Compute the L2 norm of a set of nx vectors */
695 void fvec_norms_L2 (float * __restrict nr,
696  const float * __restrict x,
697  size_t d, size_t nx)
698 {
699 
700 #pragma omp parallel for
701  for (size_t i = 0; i < nx; i++) {
702  nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
703  }
704 }
705 
706 void fvec_norms_L2sqr (float * __restrict nr,
707  const float * __restrict x,
708  size_t d, size_t nx)
709 {
710 #pragma omp parallel for
711  for (size_t i = 0; i < nx; i++)
712  nr[i] = fvec_norm_L2sqr (x + i * d, d);
713 }
714 
715 
716 
717 void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
718 {
719 #pragma omp parallel for
720  for (size_t i = 0; i < nx; i++) {
721  float * __restrict xi = x + i * d;
722 
723  float nr = fvec_norm_L2sqr (xi, d);
724 
725  if (nr > 0) {
726  size_t j;
727  const float inv_nr = 1.0 / sqrtf (nr);
728  for (j = 0; j < d; j++)
729  xi[j] *= inv_nr;
730  }
731  }
732 }
733 
734 
735 
736 
737 
738 
739 
740 
741 
742 
743 
744 
745 
746 
747 
748 
749 
750 /***************************************************************************
751  * KNN functions
752  ***************************************************************************/
753 
754 
755 
756 /* Find the nearest neighbors for nx queries in a set of ny vectors */
757 static void knn_inner_product_sse (const float * x,
758  const float * y,
759  size_t d, size_t nx, size_t ny,
760  float_minheap_array_t * res)
761 {
762  size_t k = res->k;
763 
764 #pragma omp parallel for
765  for (size_t i = 0; i < nx; i++) {
766  const float * x_ = x + i * d;
767  const float * y_ = y;
768 
769  float * __restrict simi = res->get_val(i);
770  long * __restrict idxi = res->get_ids (i);
771 
772  minheap_heapify (k, simi, idxi);
773 
774  for (size_t j = 0; j < ny; j++) {
775  float ip = fvec_inner_product (x_, y_, d);
776 
777  if (ip > simi[0]) {
778  minheap_pop (k, simi, idxi);
779  minheap_push (k, simi, idxi, ip, j);
780  }
781  y_ += d;
782  }
783  minheap_reorder (k, simi, idxi);
784  }
785 
786 }
787 
788 static void knn_L2sqr_sse (
789  const float * x,
790  const float * y,
791  size_t d, size_t nx, size_t ny,
792  float_maxheap_array_t * res)
793 {
794  size_t k = res->k;
795 
796 #pragma omp parallel for
797  for (size_t i = 0; i < nx; i++) {
798  const float * x_ = x + i * d;
799  const float * y_ = y;
800  size_t j;
801  float * __restrict simi = res->get_val(i);
802  long * __restrict idxi = res->get_ids (i);
803 
804  maxheap_heapify (k, simi, idxi);
805  for (j = 0; j < ny; j++) {
806  float disij = fvec_L2sqr (x_, y_, d);
807 
808  if (disij < simi[0]) {
809  maxheap_pop (k, simi, idxi);
810  maxheap_push (k, simi, idxi, disij, j);
811  }
812  y_ += d;
813  }
814  maxheap_reorder (k, simi, idxi);
815  }
816 
817 }
818 
819 
820 /** Find the nearest neighbors for nx queries in a set of ny vectors */
821 static void knn_inner_product_blas (
822  const float * x,
823  const float * y,
824  size_t d, size_t nx, size_t ny,
825  float_minheap_array_t * res)
826 {
827  res->heapify ();
828 
829  // BLAS does not like empty matrices
830  if (nx == 0 || ny == 0) return;
831 
832  /* block sizes */
833  const size_t bs_x = 4096, bs_y = 1024;
834  // const size_t bs_x = 16, bs_y = 16;
835  float *ip_block = new float[bs_x * bs_y];
836 
837  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
838  size_t i1 = i0 + bs_x;
839  if(i1 > nx) i1 = nx;
840 
841  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
842  size_t j1 = j0 + bs_y;
843  if (j1 > ny) j1 = ny;
844  /* compute the actual dot products */
845  {
846  float one = 1, zero = 0;
847  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
848  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
849  y + j0 * d, &di,
850  x + i0 * d, &di, &zero,
851  ip_block, &nyi);
852  }
853 
854  /* collect maxima */
855  res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
856  }
857  }
858  delete [] ip_block;
859  res->reorder ();
860 }
861 
862 // distance correction is an operator that can be applied to transform
863 // the distances
864 template<class DistanceCorrection>
865 static void knn_L2sqr_blas (const float * x,
866  const float * y,
867  size_t d, size_t nx, size_t ny,
868  float_maxheap_array_t * res,
869  const DistanceCorrection &corr)
870 {
871  res->heapify ();
872 
873  // BLAS does not like empty matrices
874  if (nx == 0 || ny == 0) return;
875 
876  size_t k = res->k;
877 
878  /* block sizes */
879  const size_t bs_x = 4096, bs_y = 1024;
880  // const size_t bs_x = 16, bs_y = 16;
881  float *ip_block = new float[bs_x * bs_y];
882 
883  float *x_norms = new float[nx];
884  fvec_norms_L2sqr (x_norms, x, d, nx);
885 
886  float *y_norms = new float[ny];
887  fvec_norms_L2sqr (y_norms, y, d, ny);
888 
889  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
890  size_t i1 = i0 + bs_x;
891  if(i1 > nx) i1 = nx;
892 
893  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
894  size_t j1 = j0 + bs_y;
895  if (j1 > ny) j1 = ny;
896  /* compute the actual dot products */
897  {
898  float one = 1, zero = 0;
899  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
900  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
901  y + j0 * d, &di,
902  x + i0 * d, &di, &zero,
903  ip_block, &nyi);
904  }
905 
906  /* collect minima */
907 #pragma omp parallel for
908  for (size_t i = i0; i < i1; i++) {
909  float * __restrict simi = res->get_val(i);
910  long * __restrict idxi = res->get_ids (i);
911  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
912 
913  for (size_t j = j0; j < j1; j++) {
914  float ip = *ip_line++;
915  float dis = x_norms[i] + y_norms[j] - 2 * ip;
916 
917  dis = corr (dis, i, j);
918 
919  if (dis < simi[0]) {
920  maxheap_pop (k, simi, idxi);
921  maxheap_push (k, simi, idxi, dis, j);
922  }
923  }
924  }
925  }
926  }
927  res->reorder ();
928 
929  delete [] ip_block;
930  delete [] x_norms;
931  delete [] y_norms;
932 }
933 
934 
935 
936 
937 
938 
939 
940 
941 
942 /*******************************************************
943  * KNN driver functions
944  *******************************************************/
945 
946 void knn_inner_product (const float * x,
947  const float * y,
948  size_t d, size_t nx, size_t ny,
949  float_minheap_array_t * res)
950 {
951  if (d % 4 == 0 && nx < 20) {
952  knn_inner_product_sse (x, y, d, nx, ny, res);
953  } else {
954  knn_inner_product_blas (x, y, d, nx, ny, res);
955  }
956 }
957 
958 
959 
961  float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
962  return dis;
963  }
964 };
965 
966 void knn_L2sqr (const float * x,
967  const float * y,
968  size_t d, size_t nx, size_t ny,
969  float_maxheap_array_t * res)
970 {
971  if (d % 4 == 0 && nx < 20) {
972  knn_L2sqr_sse (x, y, d, nx, ny, res);
973  } else {
975  knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
976  }
977 }
978 
980  const float *base_shift;
981  float operator()(float dis, size_t /*qno*/, size_t bno) const {
982  return dis - base_shift[bno];
983  }
984 };
985 
987  const float * x,
988  const float * y,
989  size_t d, size_t nx, size_t ny,
990  float_maxheap_array_t * res,
991  const float *base_shift)
992 {
993  BaseShiftDistanceCorrection corr = {base_shift};
994  knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
995 }
996 
997 
998 
999 /***************************************************************************
1000  * compute a subset of distances
1001  ***************************************************************************/
1002 
1003 /* compute the inner product between x and a subset y of ny vectors,
1004  whose indices are given by idy. */
1005 void fvec_inner_products_by_idx (float * __restrict ip,
1006  const float * x,
1007  const float * y,
1008  const long * __restrict ids, /* for y vecs */
1009  size_t d, size_t nx, size_t ny)
1010 {
1011 #pragma omp parallel for
1012  for (size_t j = 0; j < nx; j++) {
1013  const long * __restrict idsj = ids + j * ny;
1014  const float * xj = x + j * d;
1015  float * __restrict ipj = ip + j * ny;
1016  for (size_t i = 0; i < ny; i++) {
1017  if (idsj[i] < 0)
1018  continue;
1019  ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
1020  }
1021  }
1022 }
1023 
1024 /* compute the inner product between x and a subset y of ny vectors,
1025  whose indices are given by idy. */
1026 void fvec_L2sqr_by_idx (float * __restrict dis,
1027  const float * x,
1028  const float * y,
1029  const long * __restrict ids, /* ids of y vecs */
1030  size_t d, size_t nx, size_t ny)
1031 {
1032 #pragma omp parallel for
1033  for (size_t j = 0; j < nx; j++) {
1034  const long * __restrict idsj = ids + j * ny;
1035  const float * xj = x + j * d;
1036  float * __restrict disj = dis + j * ny;
1037  for (size_t i = 0; i < ny; i++) {
1038  if (idsj[i] < 0)
1039  continue;
1040  disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
1041  }
1042  }
1043 }
1044 
1045 
1046 
1047 
1048 
1049 /* Find the nearest neighbors for nx queries in a set of ny vectors
1050  indexed by ids. May be useful for re-ranking a pre-selected vector list */
1051 void knn_inner_products_by_idx (const float * x,
1052  const float * y,
1053  const long * ids,
1054  size_t d, size_t nx, size_t ny,
1055  float_minheap_array_t * res)
1056 {
1057  size_t k = res->k;
1058 
1059 #pragma omp parallel for
1060  for (size_t i = 0; i < nx; i++) {
1061  const float * x_ = x + i * d;
1062  const long * idsi = ids + i * ny;
1063  size_t j;
1064  float * __restrict simi = res->get_val(i);
1065  long * __restrict idxi = res->get_ids (i);
1066  minheap_heapify (k, simi, idxi);
1067 
1068  for (j = 0; j < ny; j++) {
1069  if (idsi[j] < 0) break;
1070  float ip = fvec_inner_product (x_, y + d * idsi[j], d);
1071 
1072  if (ip > simi[0]) {
1073  minheap_pop (k, simi, idxi);
1074  minheap_push (k, simi, idxi, ip, idsi[j]);
1075  }
1076  }
1077  minheap_reorder (k, simi, idxi);
1078  }
1079 
1080 }
1081 
1082 void knn_L2sqr_by_idx (const float * x,
1083  const float * y,
1084  const long * __restrict ids,
1085  size_t d, size_t nx, size_t ny,
1086  float_maxheap_array_t * res)
1087 {
1088  size_t k = res->k;
1089 
1090 #pragma omp parallel for
1091  for (size_t i = 0; i < nx; i++) {
1092  const float * x_ = x + i * d;
1093  const long * __restrict idsi = ids + i * ny;
1094  float * __restrict simi = res->get_val(i);
1095  long * __restrict idxi = res->get_ids (i);
1096  maxheap_heapify (res->k, simi, idxi);
1097  for (size_t j = 0; j < ny; j++) {
1098  float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
1099 
1100  if (disij < simi[0]) {
1101  maxheap_pop (k, simi, idxi);
1102  maxheap_push (k, simi, idxi, disij, idsi[j]);
1103  }
1104  }
1105  maxheap_reorder (res->k, simi, idxi);
1106  }
1107 
1108 }
1109 
1110 
1111 
1112 
1113 
1114 /***************************************************************************
1115  * Range search
1116  ***************************************************************************/
1117 
1118 /** Find the nearest neighbors for nx queries in a set of ny vectors
1119  * compute_l2 = compute pairwise squared L2 distance rather than inner prod
1120  */
1121 template <bool compute_l2>
1122 static void range_search_blas (
1123  const float * x,
1124  const float * y,
1125  size_t d, size_t nx, size_t ny,
1126  float radius,
1127  RangeSearchResult *result)
1128 {
1129 
1130  // BLAS does not like empty matrices
1131  if (nx == 0 || ny == 0) return;
1132 
1133  /* block sizes */
1134  const size_t bs_x = 4096, bs_y = 1024;
1135  // const size_t bs_x = 16, bs_y = 16;
1136  float *ip_block = new float[bs_x * bs_y];
1137 
1138  float *x_norms = nullptr, *y_norms = nullptr;
1139 
1140  if (compute_l2) {
1141  x_norms = new float[nx];
1142  fvec_norms_L2sqr (x_norms, x, d, nx);
1143  y_norms = new float[ny];
1144  fvec_norms_L2sqr (y_norms, y, d, ny);
1145  }
1146 
1147  std::vector <RangeSearchPartialResult *> partial_results;
1148 
1149  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
1150  size_t j1 = j0 + bs_y;
1151  if (j1 > ny) j1 = ny;
1152  RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
1153  partial_results.push_back (pres);
1154 
1155  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
1156  size_t i1 = i0 + bs_x;
1157  if(i1 > nx) i1 = nx;
1158 
1159  /* compute the actual dot products */
1160  {
1161  float one = 1, zero = 0;
1162  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1163  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
1164  y + j0 * d, &di,
1165  x + i0 * d, &di, &zero,
1166  ip_block, &nyi);
1167  }
1168 
1169 
1170  for (size_t i = i0; i < i1; i++) {
1171  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1172 
1173  RangeSearchPartialResult::QueryResult & qres =
1174  pres->new_result (i);
1175 
1176  for (size_t j = j0; j < j1; j++) {
1177  float ip = *ip_line++;
1178  if (compute_l2) {
1179  float dis = x_norms[i] + y_norms[j] - 2 * ip;
1180  if (dis < radius) {
1181  qres.add (dis, j);
1182  }
1183  } else {
1184  if (ip > radius) {
1185  qres.add (ip, j);
1186  }
1187  }
1188  }
1189  }
1190  }
1191 
1192  }
1193  delete [] ip_block;
1194  delete [] x_norms;
1195  delete [] y_norms;
1196 
1197  { // merge the partial results
1198  int npres = partial_results.size();
1199  // count
1200  for (size_t i = 0; i < nx; i++) {
1201  for (int j = 0; j < npres; j++)
1202  result->lims[i] += partial_results[j]->queries[i].nres;
1203  }
1204  result->do_allocation ();
1205  for (int j = 0; j < npres; j++) {
1206  partial_results[j]->set_result (true);
1207  delete partial_results[j];
1208  }
1209 
1210  // reset the limits
1211  for (size_t i = nx; i > 0; i--) {
1212  result->lims [i] = result->lims [i - 1];
1213  }
1214  result->lims [0] = 0;
1215  }
1216 }
1217 
1218 
1219 template <bool compute_l2>
1220 static void range_search_sse (const float * x,
1221  const float * y,
1222  size_t d, size_t nx, size_t ny,
1223  float radius,
1224  RangeSearchResult *res)
1225 {
1226  FAISS_THROW_IF_NOT (d % 4 == 0);
1227 
1228 #pragma omp parallel
1229  {
1230  RangeSearchPartialResult pres (res);
1231 
1232 #pragma omp for
1233  for (size_t i = 0; i < nx; i++) {
1234  const float * x_ = x + i * d;
1235  const float * y_ = y;
1236  size_t j;
1237 
1238  RangeSearchPartialResult::QueryResult & qres =
1239  pres.new_result (i);
1240 
1241  for (j = 0; j < ny; j++) {
1242  if (compute_l2) {
1243  float disij = fvec_L2sqr (x_, y_, d);
1244  if (disij < radius) {
1245  qres.add (disij, j);
1246  }
1247  } else {
1248  float ip = fvec_inner_product (x_, y_, d);
1249  if (ip > radius) {
1250  qres.add (ip, j);
1251  }
1252  }
1253  y_ += d;
1254  }
1255 
1256  }
1257  pres.finalize ();
1258  }
1259 }
1260 
1261 
1262 
1263 
1264 
1266  const float * x,
1267  const float * y,
1268  size_t d, size_t nx, size_t ny,
1269  float radius,
1270  RangeSearchResult *res)
1271 {
1272 
1273  if (d % 4 == 0 && nx < 20) {
1274  range_search_sse<true> (x, y, d, nx, ny, radius, res);
1275  } else {
1276  range_search_blas<true> (x, y, d, nx, ny, radius, res);
1277  }
1278 }
1279 
1281  const float * x,
1282  const float * y,
1283  size_t d, size_t nx, size_t ny,
1284  float radius,
1285  RangeSearchResult *res)
1286 {
1287 
1288  if (d % 4 == 0 && nx < 20) {
1289  range_search_sse<false> (x, y, d, nx, ny, radius, res);
1290  } else {
1291  range_search_blas<false> (x, y, d, nx, ny, radius, res);
1292  }
1293 }
1294 
1295 
1296 
1297 /***************************************************************************
1298  * Some matrix manipulation functions
1299  ***************************************************************************/
1300 
1301 
1302 /* This function exists because the Torch counterpart is extremly slow
1303  (not multi-threaded + unexpected overhead even in single thread).
1304  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
1305 void inner_product_to_L2sqr (float * __restrict dis,
1306  const float * nr1,
1307  const float * nr2,
1308  size_t n1, size_t n2)
1309 {
1310 
1311 #pragma omp parallel for
1312  for (size_t j = 0 ; j < n1 ; j++) {
1313  float * disj = dis + j * n2;
1314  for (size_t i = 0 ; i < n2 ; i++)
1315  disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1316  }
1317 }
1318 
1319 
1320 void matrix_qr (int m, int n, float *a)
1321 {
1322  FAISS_THROW_IF_NOT (m >= n);
1323  FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1324  std::vector<float> tau (ki);
1325  FINTEGER lwork = -1, info;
1326  float work_size;
1327 
1328  sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1329  &work_size, &lwork, &info);
1330  lwork = size_t(work_size);
1331  std::vector<float> work (lwork);
1332 
1333  sgeqrf_ (&mi, &ni, a, &mi,
1334  tau.data(), work.data(), &lwork, &info);
1335 
1336  sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1337  work.data(), &lwork, &info);
1338 
1339 }
1340 
1341 
1342 void pairwise_L2sqr (long d,
1343  long nq, const float *xq,
1344  long nb, const float *xb,
1345  float *dis,
1346  long ldq, long ldb, long ldd)
1347 {
1348  if (nq == 0 || nb == 0) return;
1349  if (ldq == -1) ldq = d;
1350  if (ldb == -1) ldb = d;
1351  if (ldd == -1) ldd = nb;
1352 
1353  // store in beginning of distance matrix to avoid malloc
1354  float *b_norms = dis;
1355 
1356 #pragma omp parallel for
1357  for (long i = 0; i < nb; i++)
1358  b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
1359 
1360 #pragma omp parallel for
1361  for (long i = 1; i < nq; i++) {
1362  float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
1363  for (long j = 0; j < nb; j++)
1364  dis[i * ldd + j] = q_norm + b_norms [j];
1365  }
1366 
1367  {
1368  float q_norm = fvec_norm_L2sqr (xq, d);
1369  for (long j = 0; j < nb; j++)
1370  dis[j] += q_norm;
1371  }
1372 
1373  {
1374  FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1375  float one = 1.0, minus_2 = -2.0;
1376 
1377  sgemm_ ("Transposed", "Not transposed",
1378  &nbi, &nqi, &di,
1379  &minus_2,
1380  xb, &ldbi,
1381  xq, &ldqi,
1382  &one, dis, &lddi);
1383  }
1384 
1385 
1386 }
1387 
1388 
1389 
1390 /***************************************************************************
1391  * Kmeans subroutine
1392  ***************************************************************************/
1393 
1394 // a bit above machine epsilon for float16
1395 
1396 #define EPS (1 / 1024.)
1397 
1398 /* For k-means, compute centroids given assignment of vectors to centroids */
1399 int km_update_centroids (const float * x,
1400  float * centroids,
1401  long * assign,
1402  size_t d, size_t k, size_t n,
1403  size_t k_frozen)
1404 {
1405  k -= k_frozen;
1406  centroids += k_frozen * d;
1407 
1408  std::vector<size_t> hassign(k);
1409  memset (centroids, 0, sizeof(*centroids) * d * k);
1410 
1411 #pragma omp parallel
1412  {
1413  int nt = omp_get_num_threads();
1414  int rank = omp_get_thread_num();
1415  // this thread is taking care of centroids c0:c1
1416  size_t c0 = (k * rank) / nt;
1417  size_t c1 = (k * (rank + 1)) / nt;
1418  const float *xi = x;
1419  size_t nacc = 0;
1420 
1421  for (size_t i = 0; i < n; i++) {
1422  long ci = assign[i];
1423  assert (ci >= 0 && ci < k + k_frozen);
1424  ci -= k_frozen;
1425  if (ci >= c0 && ci < c1) {
1426  float * c = centroids + ci * d;
1427  hassign[ci]++;
1428  for (size_t j = 0; j < d; j++)
1429  c[j] += xi[j];
1430  nacc++;
1431  }
1432  xi += d;
1433  }
1434 
1435  }
1436 
1437 #pragma omp parallel for
1438  for (size_t ci = 0; ci < k; ci++) {
1439  float * c = centroids + ci * d;
1440  float ni = (float) hassign[ci];
1441  if (ni != 0) {
1442  for (size_t j = 0; j < d; j++)
1443  c[j] /= ni;
1444  }
1445  }
1446 
1447  /* Take care of void clusters */
1448  size_t nsplit = 0;
1449  RandomGenerator rng (1234);
1450  for (size_t ci = 0; ci < k; ci++) {
1451  if (hassign[ci] == 0) { /* need to redefine a centroid */
1452  size_t cj;
1453  for (cj = 0; 1; cj = (cj + 1) % k) {
1454  /* probability to pick this cluster for split */
1455  float p = (hassign[cj] - 1.0) / (float) (n - k);
1456  float r = rng.rand_float ();
1457  if (r < p) {
1458  break; /* found our cluster to be split */
1459  }
1460  }
1461  memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
1462 
1463  /* small symmetric pertubation. Much better than */
1464  for (size_t j = 0; j < d; j++) {
1465  if (j % 2 == 0) {
1466  centroids[ci * d + j] *= 1 + EPS;
1467  centroids[cj * d + j] *= 1 - EPS;
1468  } else {
1469  centroids[ci * d + j] *= 1 - EPS;
1470  centroids[cj * d + j] *= 1 + EPS;
1471  }
1472  }
1473 
1474  /* assume even split of the cluster */
1475  hassign[ci] = hassign[cj] / 2;
1476  hassign[cj] -= hassign[ci];
1477  nsplit++;
1478  }
1479  }
1480 
1481  return nsplit;
1482 }
1483 
1484 #undef EPS
1485 
1486 
1487 
1488 /***************************************************************************
1489  * Result list routines
1490  ***************************************************************************/
1491 
1492 
1493 void ranklist_handle_ties (int k, long *idx, const float *dis)
1494 {
1495  float prev_dis = -1e38;
1496  int prev_i = -1;
1497  for (int i = 0; i < k; i++) {
1498  if (dis[i] != prev_dis) {
1499  if (i > prev_i + 1) {
1500  // sort between prev_i and i - 1
1501  std::sort (idx + prev_i, idx + i);
1502  }
1503  prev_i = i;
1504  prev_dis = dis[i];
1505  }
1506  }
1507 }
1508 
1509 size_t merge_result_table_with (size_t n, size_t k,
1510  long *I0, float *D0,
1511  const long *I1, const float *D1,
1512  bool keep_min,
1513  long translation)
1514 {
1515  size_t n1 = 0;
1516 
1517 #pragma omp parallel reduction(+:n1)
1518  {
1519  std::vector<long> tmpI (k);
1520  std::vector<float> tmpD (k);
1521 
1522 #pragma omp for
1523  for (size_t i = 0; i < n; i++) {
1524  long *lI0 = I0 + i * k;
1525  float *lD0 = D0 + i * k;
1526  const long *lI1 = I1 + i * k;
1527  const float *lD1 = D1 + i * k;
1528  size_t r0 = 0;
1529  size_t r1 = 0;
1530 
1531  if (keep_min) {
1532  for (size_t j = 0; j < k; j++) {
1533 
1534  if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
1535  tmpD[j] = lD0[r0];
1536  tmpI[j] = lI0[r0];
1537  r0++;
1538  } else if (lD1[r1] >= 0) {
1539  tmpD[j] = lD1[r1];
1540  tmpI[j] = lI1[r1] + translation;
1541  r1++;
1542  } else { // both are NaNs
1543  tmpD[j] = NAN;
1544  tmpI[j] = -1;
1545  }
1546  }
1547  } else {
1548  for (size_t j = 0; j < k; j++) {
1549  if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
1550  tmpD[j] = lD0[r0];
1551  tmpI[j] = lI0[r0];
1552  r0++;
1553  } else if (lD1[r1] >= 0) {
1554  tmpD[j] = lD1[r1];
1555  tmpI[j] = lI1[r1] + translation;
1556  r1++;
1557  } else { // both are NaNs
1558  tmpD[j] = NAN;
1559  tmpI[j] = -1;
1560  }
1561  }
1562  }
1563  n1 += r1;
1564  memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
1565  memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
1566  }
1567  }
1568 
1569  return n1;
1570 }
1571 
1572 
1573 
1574 size_t ranklist_intersection_size (size_t k1, const long *v1,
1575  size_t k2, const long *v2_in)
1576 {
1577  if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
1578  long *v2 = new long [k2];
1579  memcpy (v2, v2_in, sizeof (long) * k2);
1580  std::sort (v2, v2 + k2);
1581  { // de-dup v2
1582  long prev = -1;
1583  size_t wp = 0;
1584  for (size_t i = 0; i < k2; i++) {
1585  if (v2 [i] != prev) {
1586  v2[wp++] = prev = v2 [i];
1587  }
1588  }
1589  k2 = wp;
1590  }
1591  const long seen_flag = 1L << 60;
1592  size_t count = 0;
1593  for (size_t i = 0; i < k1; i++) {
1594  long q = v1 [i];
1595  size_t i0 = 0, i1 = k2;
1596  while (i0 + 1 < i1) {
1597  size_t imed = (i1 + i0) / 2;
1598  long piv = v2 [imed] & ~seen_flag;
1599  if (piv <= q) i0 = imed;
1600  else i1 = imed;
1601  }
1602  if (v2 [i0] == q) {
1603  count++;
1604  v2 [i0] |= seen_flag;
1605  }
1606  }
1607  delete [] v2;
1608 
1609  return count;
1610 }
1611 
1612 double imbalance_factor (int k, const int *hist) {
1613  double tot = 0, uf = 0;
1614 
1615  for (int i = 0 ; i < k ; i++) {
1616  tot += hist[i];
1617  uf += hist[i] * (double) hist[i];
1618  }
1619  uf = uf * k / (tot * tot);
1620 
1621  return uf;
1622 }
1623 
1624 
1625 double imbalance_factor (int n, int k, const long *assign) {
1626  std::vector<int> hist(k, 0);
1627  for (int i = 0; i < n; i++) {
1628  hist[assign[i]]++;
1629  }
1630 
1631  return imbalance_factor (k, hist.data());
1632 }
1633 
1634 
1635 
1636 int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
1637  memset (hist, 0, sizeof(hist[0]) * vmax);
1638  int nout = 0;
1639  while (n--) {
1640  if (v[n] < 0 || v[n] >= vmax) nout++;
1641  else hist[v[n]]++;
1642  }
1643  return nout;
1644 }
1645 
1646 
1647 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
1648 {
1649  FAISS_THROW_IF_NOT (nbits % 8 == 0);
1650  size_t d = nbits / 8;
1651  std::vector<int> accu(d * 256);
1652  const uint8_t *c = codes;
1653  for (size_t i = 0; i < n; i++)
1654  for(int j = 0; j < d; j++)
1655  accu[j * 256 + *c++]++;
1656  memset (hist, 0, sizeof(*hist) * nbits);
1657  for (int i = 0; i < d; i++) {
1658  const int *ai = accu.data() + i * 256;
1659  int * hi = hist + i * 8;
1660  for (int j = 0; j < 256; j++)
1661  for (int k = 0; k < 8; k++)
1662  if ((j >> k) & 1)
1663  hi[k] += ai[j];
1664  }
1665 
1666 }
1667 
1668 
1669 
1670 size_t ivec_checksum (size_t n, const int *a)
1671 {
1672  size_t cs = 112909;
1673  while (n--) cs = cs * 65713 + a[n] * 1686049;
1674  return cs;
1675 }
1676 
1677 
1678 namespace {
1679  struct ArgsortComparator {
1680  const float *vals;
1681  bool operator() (const size_t a, const size_t b) const {
1682  return vals[a] < vals[b];
1683  }
1684  };
1685 
1686  struct SegmentS {
1687  size_t i0; // begin pointer in the permutation array
1688  size_t i1; // end
1689  size_t len() const {
1690  return i1 - i0;
1691  }
1692  };
1693 
1694  // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
1695  // extended to > 1 merge thread
1696 
1697  // merges 2 ranges that should be consecutive on the source into
1698  // the union of the two on the destination
1699  template<typename T>
1700  void parallel_merge (const T *src, T *dst,
1701  SegmentS &s1, SegmentS & s2, int nt,
1702  const ArgsortComparator & comp) {
1703  if (s2.len() > s1.len()) { // make sure that s1 larger than s2
1704  std::swap(s1, s2);
1705  }
1706 
1707  // compute sub-ranges for each thread
1708  SegmentS s1s[nt], s2s[nt], sws[nt];
1709  s2s[0].i0 = s2.i0;
1710  s2s[nt - 1].i1 = s2.i1;
1711 
1712  // not sure parallel actually helps here
1713 #pragma omp parallel for num_threads(nt)
1714  for (int t = 0; t < nt; t++) {
1715  s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1716  s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1717 
1718  if (t + 1 < nt) {
1719  T pivot = src[s1s[t].i1];
1720  size_t i0 = s2.i0, i1 = s2.i1;
1721  while (i0 + 1 < i1) {
1722  size_t imed = (i1 + i0) / 2;
1723  if (comp (pivot, src[imed])) {i1 = imed; }
1724  else {i0 = imed; }
1725  }
1726  s2s[t].i1 = s2s[t + 1].i0 = i1;
1727  }
1728  }
1729  s1.i0 = std::min(s1.i0, s2.i0);
1730  s1.i1 = std::max(s1.i1, s2.i1);
1731  s2 = s1;
1732  sws[0].i0 = s1.i0;
1733  for (int t = 0; t < nt; t++) {
1734  sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1735  if (t + 1 < nt) {
1736  sws[t + 1].i0 = sws[t].i1;
1737  }
1738  }
1739  assert(sws[nt - 1].i1 == s1.i1);
1740 
1741  // do the actual merging
1742 #pragma omp parallel for num_threads(nt)
1743  for (int t = 0; t < nt; t++) {
1744  SegmentS sw = sws[t];
1745  SegmentS s1t = s1s[t];
1746  SegmentS s2t = s2s[t];
1747  if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1748  for (;;) {
1749  // assert (sw.len() == s1t.len() + s2t.len());
1750  if (comp(src[s1t.i0], src[s2t.i0])) {
1751  dst[sw.i0++] = src[s1t.i0++];
1752  if (s1t.i0 == s1t.i1) break;
1753  } else {
1754  dst[sw.i0++] = src[s2t.i0++];
1755  if (s2t.i0 == s2t.i1) break;
1756  }
1757  }
1758  }
1759  if (s1t.len() > 0) {
1760  assert(s1t.len() == sw.len());
1761  memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
1762  } else if (s2t.len() > 0) {
1763  assert(s2t.len() == sw.len());
1764  memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
1765  }
1766  }
1767  }
1768 
1769 };
1770 
1771 void fvec_argsort (size_t n, const float *vals,
1772  size_t *perm)
1773 {
1774  for (size_t i = 0; i < n; i++) perm[i] = i;
1775  ArgsortComparator comp = {vals};
1776  std::sort (perm, perm + n, comp);
1777 }
1778 
1779 void fvec_argsort_parallel (size_t n, const float *vals,
1780  size_t *perm)
1781 {
1782  size_t * perm2 = new size_t[n];
1783  // 2 result tables, during merging, flip between them
1784  size_t *permB = perm2, *permA = perm;
1785 
1786  int nt = omp_get_max_threads();
1787  { // prepare correct permutation so that the result ends in perm
1788  // at final iteration
1789  int nseg = nt;
1790  while (nseg > 1) {
1791  nseg = (nseg + 1) / 2;
1792  std::swap (permA, permB);
1793  }
1794  }
1795 
1796 #pragma omp parallel
1797  for (size_t i = 0; i < n; i++) permA[i] = i;
1798 
1799  ArgsortComparator comp = {vals};
1800 
1801  SegmentS segs[nt];
1802 
1803  // independent sorts
1804 #pragma omp parallel for
1805  for (int t = 0; t < nt; t++) {
1806  size_t i0 = t * n / nt;
1807  size_t i1 = (t + 1) * n / nt;
1808  SegmentS seg = {i0, i1};
1809  std::sort (permA + seg.i0, permA + seg.i1, comp);
1810  segs[t] = seg;
1811  }
1812  int prev_nested = omp_get_nested();
1813  omp_set_nested(1);
1814 
1815  int nseg = nt;
1816  while (nseg > 1) {
1817  int nseg1 = (nseg + 1) / 2;
1818  int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1819  int sub_nseg1 = nseg / 2;
1820 
1821 #pragma omp parallel for num_threads(nseg1)
1822  for (int s = 0; s < nseg; s += 2) {
1823  if (s + 1 == nseg) { // otherwise isolated segment
1824  memcpy(permB + segs[s].i0, permA + segs[s].i0,
1825  segs[s].len() * sizeof(size_t));
1826  } else {
1827  int t0 = s * sub_nt / sub_nseg1;
1828  int t1 = (s + 1) * sub_nt / sub_nseg1;
1829  printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1830  parallel_merge(permA, permB, segs[s], segs[s + 1],
1831  t1 - t0, comp);
1832  }
1833  }
1834  for (int s = 0; s < nseg; s += 2)
1835  segs[s / 2] = segs[s];
1836  nseg = nseg1;
1837  std::swap (permA, permB);
1838  }
1839  assert (permA == perm);
1840  omp_set_nested(prev_nested);
1841  delete [] perm2;
1842 }
1843 
1844 
1845 
1846 
1847 
1848 
1849 
1850 
1851 
1852 
1853 
1854 
1855 
1856 
1857 
1858 
1859 /***************************************************************************
1860  * heavily optimized table computations
1861  ***************************************************************************/
1862 
1863 
1864 static inline void fvec_madd_ref (size_t n, const float *a,
1865  float bf, const float *b, float *c) {
1866  for (size_t i = 0; i < n; i++)
1867  c[i] = a[i] + bf * b[i];
1868 }
1869 
1870 
1871 static inline void fvec_madd_sse (size_t n, const float *a,
1872  float bf, const float *b, float *c) {
1873  n >>= 2;
1874  __m128 bf4 = _mm_set_ps1 (bf);
1875  __m128 * a4 = (__m128*)a;
1876  __m128 * b4 = (__m128*)b;
1877  __m128 * c4 = (__m128*)c;
1878 
1879  while (n--) {
1880  *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1881  b4++;
1882  a4++;
1883  c4++;
1884  }
1885 }
1886 
1887 void fvec_madd (size_t n, const float *a,
1888  float bf, const float *b, float *c)
1889 {
1890  if ((n & 3) == 0 &&
1891  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1892  fvec_madd_sse (n, a, bf, b, c);
1893  else
1894  fvec_madd_ref (n, a, bf, b, c);
1895 }
1896 
1897 static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
1898  float bf, const float *b, float *c) {
1899  float vmin = 1e20;
1900  int imin = -1;
1901 
1902  for (size_t i = 0; i < n; i++) {
1903  c[i] = a[i] + bf * b[i];
1904  if (c[i] < vmin) {
1905  vmin = c[i];
1906  imin = i;
1907  }
1908  }
1909  return imin;
1910 }
1911 
1912 static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
1913  float bf, const float *b, float *c) {
1914  n >>= 2;
1915  __m128 bf4 = _mm_set_ps1 (bf);
1916  __m128 vmin4 = _mm_set_ps1 (1e20);
1917  __m128i imin4 = _mm_set1_epi32 (-1);
1918  __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1919  __m128i inc4 = _mm_set1_epi32 (4);
1920  __m128 * a4 = (__m128*)a;
1921  __m128 * b4 = (__m128*)b;
1922  __m128 * c4 = (__m128*)c;
1923 
1924  while (n--) {
1925  __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1926  *c4 = vc4;
1927  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1928  // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
1929 
1930  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1931  _mm_andnot_si128 (mask, imin4));
1932  vmin4 = _mm_min_ps (vmin4, vc4);
1933  b4++;
1934  a4++;
1935  c4++;
1936  idx4 = _mm_add_epi32 (idx4, inc4);
1937  }
1938 
1939  // 4 values -> 2
1940  {
1941  idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1942  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1943  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1944  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1945  _mm_andnot_si128 (mask, imin4));
1946  vmin4 = _mm_min_ps (vmin4, vc4);
1947  }
1948  // 2 values -> 1
1949  {
1950  idx4 = _mm_shuffle_epi32 (imin4, 1);
1951  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1952  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1953  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1954  _mm_andnot_si128 (mask, imin4));
1955  // vmin4 = _mm_min_ps (vmin4, vc4);
1956  }
1957  return _mm_extract_epi32 (imin4, 0);
1958 }
1959 
1960 
1961 int fvec_madd_and_argmin (size_t n, const float *a,
1962  float bf, const float *b, float *c)
1963 {
1964  if ((n & 3) == 0 &&
1965  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1966  return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1967  else
1968  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
1969 }
1970 
1971 
1972 
1974  size_t d, size_t *n, size_t nmax, const float *x,
1975  bool verbose, long seed)
1976 {
1977 
1978  if (*n <= nmax) return x; // nothing to do
1979 
1980  size_t n2 = nmax;
1981  if (verbose) {
1982  printf (" Input training set too big (max size is %ld), sampling "
1983  "%ld / %ld vectors\n", nmax, n2, *n);
1984  }
1985  std::vector<int> subset (*n);
1986  rand_perm (subset.data (), *n, seed);
1987  float *x_subset = new float[n2 * d];
1988  for (long i = 0; i < n2; i++)
1989  memcpy (&x_subset[i * d],
1990  &x[subset[i] * size_t(d)],
1991  sizeof (x[0]) * d);
1992  *n = n2;
1993  return x_subset;
1994 }
1995 
1996 
1997 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:48
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
Definition: utils.cpp:1399
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:986
RandomGenerator(long seed=1234)
initialize
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:574
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1647
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1973
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1493
float rand_float()
between 0 and 1
Definition: utils.cpp:211
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1887
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1636
long rand_long()
random long &lt; 2 ^ 62
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
Definition: utils.cpp:1509
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1574
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1342
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1280
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:946
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:74
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1612
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:632
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1265
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1320
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1670
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1961
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:966