Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved
10 // -*- c++ -*-
11 
12 #include "utils.h"
13 
14 #include <cstdio>
15 #include <cassert>
16 #include <cstring>
17 #include <cmath>
18 
19 #include <immintrin.h>
20 
21 
22 #include <sys/time.h>
23 #include <sys/types.h>
24 #include <unistd.h>
25 
26 #include <omp.h>
27 
28 
29 #include <algorithm>
30 #include <vector>
31 
32 #include "AuxIndexStructures.h"
33 #include "FaissAssert.h"
34 
35 
36 
37 #ifndef FINTEGER
38 #define FINTEGER long
39 #endif
40 
41 
42 extern "C" {
43 
44 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
45 
46 int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
47  n, FINTEGER *k, const float *alpha, const float *a,
48  FINTEGER *lda, const float *b, FINTEGER *
49  ldb, float *beta, float *c, FINTEGER *ldc);
50 
51 /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
52 
53 int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
54  float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
55 
56 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
57  FINTEGER *lda, float *tau, float *work,
58  FINTEGER *lwork, FINTEGER *info);
59 
60 
61 }
62 
63 
64 /**************************************************
65  * Get some stats about the system
66  **************************************************/
67 
68 namespace faiss {
69 
70 #ifdef __AVX__
71 #define USE_AVX
72 #endif
73 
74 double getmillisecs () {
75  struct timeval tv;
76  gettimeofday (&tv, nullptr);
77  return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
78 }
79 
80 
81 #ifdef __linux__
82 
83 size_t get_mem_usage_kb ()
84 {
85  int pid = getpid ();
86  char fname[256];
87  snprintf (fname, 256, "/proc/%d/status", pid);
88  FILE * f = fopen (fname, "r");
89  FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
90  size_t sz = 0;
91  for (;;) {
92  char buf [256];
93  if (!fgets (buf, 256, f)) break;
94  if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
95  }
96  fclose (f);
97  return sz;
98 }
99 
100 #elif __APPLE__
101 
102 size_t get_mem_usage_kb ()
103 {
104  fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
105  return 0;
106 }
107 
108 #endif
109 
110 
111 
112 /**************************************************
113  * Random data generation functions
114  **************************************************/
115 
116 /**
117  * The definition of random functions depends on the architecture:
118  *
119  * - for Linux, we rely on re-entrant functions (random_r). This
120  * provides good quality reproducible random sequences.
121  *
122  * - for Apple, we use rand_r. Apple is trying so hard to deprecate
123  * this function that it removed its definition form stdlib.h, so we
124  * re-declare it below. Fortunately, since it is deprecated, its
125  * prototype should not change much in the forerseeable future.
126  *
127  * Unfortunately, system designers are more concerned with making the
128  * most unpredictable random sequences for cryptographic use, when in
129  * scientific contexts what acutally matters is having reproducible
130  * squences in multi-threaded contexts.
131  */
132 
133 
134 #ifdef __linux__
135 
136 
137 
138 
140 {
141  int32_t a;
142  random_r (&rand_data, &a);
143  return a;
144 }
145 
147 {
148  int32_t a, b;
149  random_r (&rand_data, &a);
150  random_r (&rand_data, &b);
151  return long(a) | long(b) << 31;
152 }
153 
154 
156 {
157  memset (&rand_data, 0, sizeof (rand_data));
158  initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
159 }
160 
161 
162 RandomGenerator::RandomGenerator (const RandomGenerator & other)
163 {
164  memcpy (rand_state, other.rand_state, sizeof(rand_state));
165  rand_data = other.rand_data;
166  setstate_r (rand_state, &rand_data);
167 }
168 
169 
170 #elif __APPLE__
171 
172 extern "C" {
173 int rand_r(unsigned *seed);
174 }
175 
177 {
178  rand_state = seed;
179 }
180 
181 
182 RandomGenerator::RandomGenerator (const RandomGenerator & other)
183 {
184  rand_state = other.rand_state;
185 }
186 
187 
189 {
190  // RAND_MAX is 31 bits
191  // try to add more randomness in the lower bits
192  int lowbits = rand_r(&rand_state) >> 15;
193  return rand_r(&rand_state) ^ lowbits;
194 }
195 
197 {
198  return long(random()) | long(random()) << 31;
199 }
200 
201 
202 
203 #endif
204 
206 { // this suffers form non-uniform probabilities when max is not a
207  // power of 2, but if RAND_MAX >> max the bias is limited.
208  return rand_int () % max;
209 }
210 
212 {
213  return rand_int() / float(1L << 31);
214 }
215 
216 double RandomGenerator::rand_double ()
217 {
218  return rand_long() / double(1L << 62);
219 }
220 
221 
222 /***********************************************************************
223  * Random functions in this C file only exist because Torch
224  * counterparts are slow and not multi-threaded. Typical use is for
225  * more than 1-100 billion values. */
226 
227 
228 /* Generate a set of random floating point values such that x[i] in [0,1]
229  multi-threading. For this reason, we rely on re-entreant functions. */
230 void float_rand (float * x, size_t n, long seed)
231 {
232  // only try to parallelize on large enough arrays
233  const size_t nblock = n < 1024 ? 1 : 1024;
234 
235  RandomGenerator rng0 (seed);
236  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
237 
238 #pragma omp parallel for
239  for (size_t j = 0; j < nblock; j++) {
240 
241  RandomGenerator rng (a0 + j * b0);
242 
243  const size_t istart = j * n / nblock;
244  const size_t iend = (j + 1) * n / nblock;
245 
246  for (size_t i = istart; i < iend; i++)
247  x[i] = rng.rand_float ();
248  }
249 }
250 
251 
252 void float_randn (float * x, size_t n, long seed)
253 {
254  // only try to parallelize on large enough arrays
255  const size_t nblock = n < 1024 ? 1 : 1024;
256 
257  RandomGenerator rng0 (seed);
258  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
259 
260 #pragma omp parallel for
261  for (size_t j = 0; j < nblock; j++) {
262  RandomGenerator rng (a0 + j * b0);
263 
264  double a = 0, b = 0, s = 0;
265  int state = 0; /* generate two number per "do-while" loop */
266 
267  const size_t istart = j * n / nblock;
268  const size_t iend = (j + 1) * n / nblock;
269 
270  for (size_t i = istart; i < iend; i++) {
271  /* Marsaglia's method (see Knuth) */
272  if (state == 0) {
273  do {
274  a = 2.0 * rng.rand_double () - 1;
275  b = 2.0 * rng.rand_double () - 1;
276  s = a * a + b * b;
277  } while (s >= 1.0);
278  x[i] = a * sqrt(-2.0 * log(s) / s);
279  }
280  else
281  x[i] = b * sqrt(-2.0 * log(s) / s);
282  state = 1 - state;
283  }
284  }
285 }
286 
287 
288 /* Integer versions */
289 void long_rand (long * x, size_t n, long seed)
290 {
291  // only try to parallelize on large enough arrays
292  const size_t nblock = n < 1024 ? 1 : 1024;
293 
294  RandomGenerator rng0 (seed);
295  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
296 
297 #pragma omp parallel for
298  for (size_t j = 0; j < nblock; j++) {
299 
300  RandomGenerator rng (a0 + j * b0);
301 
302  const size_t istart = j * n / nblock;
303  const size_t iend = (j + 1) * n / nblock;
304  for (size_t i = istart; i < iend; i++)
305  x[i] = rng.rand_long ();
306  }
307 }
308 
309 
310 
311 void rand_perm (int *perm, size_t n, long seed)
312 {
313  for (size_t i = 0; i < n; i++) perm[i] = i;
314 
315  RandomGenerator rng (seed);
316 
317  for (size_t i = 0; i + 1 < n; i++) {
318  int i2 = i + rng.rand_int (n - i);
319  std::swap(perm[i], perm[i2]);
320  }
321 }
322 
323 
324 
325 
326 void byte_rand (uint8_t * x, size_t n, long seed)
327 {
328  // only try to parallelize on large enough arrays
329  const size_t nblock = n < 1024 ? 1 : 1024;
330 
331  RandomGenerator rng0 (seed);
332  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
333 
334 #pragma omp parallel for
335  for (size_t j = 0; j < nblock; j++) {
336 
337  RandomGenerator rng (a0 + j * b0);
338 
339  const size_t istart = j * n / nblock;
340  const size_t iend = (j + 1) * n / nblock;
341 
342  size_t i;
343  for (i = istart; i < iend; i++)
344  x[i] = rng.rand_long ();
345  }
346 }
347 
348 
349 
350 void reflection (const float * __restrict u,
351  float * __restrict x,
352  size_t n, size_t d, size_t nu)
353 {
354  size_t i, j, l;
355  for (i = 0; i < n; i++) {
356  const float * up = u;
357  for (l = 0; l < nu; l++) {
358  float ip1 = 0, ip2 = 0;
359 
360  for (j = 0; j < d; j+=2) {
361  ip1 += up[j] * x[j];
362  ip2 += up[j+1] * x[j+1];
363  }
364  float ip = 2 * (ip1 + ip2);
365 
366  for (j = 0; j < d; j++)
367  x[j] -= ip * up[j];
368  up += d;
369  }
370  x += d;
371  }
372 }
373 
374 
375 /* Reference implementation (slower) */
376 void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
377 {
378  size_t i, j, l;
379  for (i = 0; i < n; i++) {
380  const float * up = u;
381  for (l = 0; l < nu; l++) {
382  double ip = 0;
383 
384  for (j = 0; j < d; j++)
385  ip += up[j] * x[j];
386  ip *= 2;
387 
388  for (j = 0; j < d; j++)
389  x[j] -= ip * up[j];
390 
391  up += d;
392  }
393  x += d;
394  }
395 }
396 
397 /*********************************************************
398  * Optimized distance computations
399  *********************************************************/
400 
401 
402 
403 /* Functions to compute:
404  - L2 distance between 2 vectors
405  - inner product between 2 vectors
406  - L2 norm of a vector
407 
408  The functions should probably not be invoked when a large number of
409  vectors are be processed in batch (in which case Matrix multiply
410  is faster), but may be useful for comparing vectors isolated in
411  memory.
412 
413  Works with any vectors of any dimension, even unaligned (in which
414  case they are slower).
415 
416 */
417 
418 
419 /*********************************************************
420  * Reference implementations
421  */
422 
423 
424 
425 /* same without SSE */
426 float fvec_L2sqr_ref (const float * x,
427  const float * y,
428  size_t d)
429 {
430  size_t i;
431  float res_ = 0;
432  for (i = 0; i < d; i++) {
433  const float tmp = x[i] - y[i];
434  res_ += tmp * tmp;
435  }
436  return res_;
437 }
438 
439 float fvec_inner_product_ref (const float * x,
440  const float * y,
441  size_t d)
442 {
443  size_t i;
444  float res_ = 0;
445  for (i = 0; i < d; i++)
446  res_ += x[i] * y[i];
447  return res_;
448 }
449 
450 float fvec_norm_L2sqr_ref (const float * __restrict x,
451  size_t d)
452 {
453  size_t i;
454  double res_ = 0;
455  for (i = 0; i < d; i++)
456  res_ += x[i] * x[i];
457  return res_;
458 }
459 
460 
461 /*********************************************************
462  * SSE and AVX implementations
463  */
464 
465 // reads 0 <= d < 4 floats as __m128
466 static inline __m128 masked_read (int d, const float *x)
467 {
468  assert (0 <= d && d < 4);
469  __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
470  switch (d) {
471  case 3:
472  buf[2] = x[2];
473  case 2:
474  buf[1] = x[1];
475  case 1:
476  buf[0] = x[0];
477  }
478  return _mm_load_ps (buf);
479  // cannot use AVX2 _mm_mask_set1_epi32
480 }
481 
482 #ifdef USE_AVX
483 
484 // reads 0 <= d < 8 floats as __m256
485 static inline __m256 masked_read_8 (int d, const float *x)
486 {
487  assert (0 <= d && d < 8);
488  if (d < 4) {
489  __m256 res = _mm256_setzero_ps ();
490  res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
491  return res;
492  } else {
493  __m256 res = _mm256_setzero_ps ();
494  res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
495  res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
496  return res;
497  }
498 }
499 
500 float fvec_inner_product (const float * x,
501  const float * y,
502  size_t d)
503 {
504  __m256 msum1 = _mm256_setzero_ps();
505 
506  while (d >= 8) {
507  __m256 mx = _mm256_loadu_ps (x); x += 8;
508  __m256 my = _mm256_loadu_ps (y); y += 8;
509  msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
510  d -= 8;
511  }
512 
513  __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
514  msum2 += _mm256_extractf128_ps(msum1, 0);
515 
516  if (d >= 4) {
517  __m128 mx = _mm_loadu_ps (x); x += 4;
518  __m128 my = _mm_loadu_ps (y); y += 4;
519  msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
520  d -= 4;
521  }
522 
523  if (d > 0) {
524  __m128 mx = masked_read (d, x);
525  __m128 my = masked_read (d, y);
526  msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
527  }
528 
529  msum2 = _mm_hadd_ps (msum2, msum2);
530  msum2 = _mm_hadd_ps (msum2, msum2);
531  return _mm_cvtss_f32 (msum2);
532 }
533 
534 float fvec_L2sqr (const float * x,
535  const float * y,
536  size_t d)
537 {
538  __m256 msum1 = _mm256_setzero_ps();
539 
540  while (d >= 8) {
541  __m256 mx = _mm256_loadu_ps (x); x += 8;
542  __m256 my = _mm256_loadu_ps (y); y += 8;
543  const __m256 a_m_b1 = mx - my;
544  msum1 += a_m_b1 * a_m_b1;
545  d -= 8;
546  }
547 
548  __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
549  msum2 += _mm256_extractf128_ps(msum1, 0);
550 
551  if (d >= 4) {
552  __m128 mx = _mm_loadu_ps (x); x += 4;
553  __m128 my = _mm_loadu_ps (y); y += 4;
554  const __m128 a_m_b1 = mx - my;
555  msum2 += a_m_b1 * a_m_b1;
556  d -= 4;
557  }
558 
559  if (d > 0) {
560  __m128 mx = masked_read (d, x);
561  __m128 my = masked_read (d, y);
562  __m128 a_m_b1 = mx - my;
563  msum2 += a_m_b1 * a_m_b1;
564  }
565 
566  msum2 = _mm_hadd_ps (msum2, msum2);
567  msum2 = _mm_hadd_ps (msum2, msum2);
568  return _mm_cvtss_f32 (msum2);
569 }
570 
571 #else
572 
573 /* SSE-implementation of L2 distance */
574 float fvec_L2sqr (const float * x,
575  const float * y,
576  size_t d)
577 {
578  __m128 msum1 = _mm_setzero_ps();
579 
580  while (d >= 4) {
581  __m128 mx = _mm_loadu_ps (x); x += 4;
582  __m128 my = _mm_loadu_ps (y); y += 4;
583  const __m128 a_m_b1 = mx - my;
584  msum1 += a_m_b1 * a_m_b1;
585  d -= 4;
586  }
587 
588  if (d > 0) {
589  // add the last 1, 2 or 3 values
590  __m128 mx = masked_read (d, x);
591  __m128 my = masked_read (d, y);
592  __m128 a_m_b1 = mx - my;
593  msum1 += a_m_b1 * a_m_b1;
594  }
595 
596  msum1 = _mm_hadd_ps (msum1, msum1);
597  msum1 = _mm_hadd_ps (msum1, msum1);
598  return _mm_cvtss_f32 (msum1);
599 }
600 
601 
602 float fvec_inner_product (const float * x,
603  const float * y,
604  size_t d)
605 {
606  __m128 mx, my;
607  __m128 msum1 = _mm_setzero_ps();
608 
609  while (d >= 4) {
610  mx = _mm_loadu_ps (x); x += 4;
611  my = _mm_loadu_ps (y); y += 4;
612  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
613  d -= 4;
614  }
615 
616  // add the last 1, 2, or 3 values
617  mx = masked_read (d, x);
618  my = masked_read (d, y);
619  __m128 prod = _mm_mul_ps (mx, my);
620 
621  msum1 = _mm_add_ps (msum1, prod);
622 
623  msum1 = _mm_hadd_ps (msum1, msum1);
624  msum1 = _mm_hadd_ps (msum1, msum1);
625  return _mm_cvtss_f32 (msum1);
626 }
627 
628 
629 
630 #endif
631 
632 float fvec_norm_L2sqr (const float * x,
633  size_t d)
634 {
635  __m128 mx;
636  __m128 msum1 = _mm_setzero_ps();
637 
638  while (d >= 4) {
639  mx = _mm_loadu_ps (x); x += 4;
640  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
641  d -= 4;
642  }
643 
644  mx = masked_read (d, x);
645  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
646 
647  msum1 = _mm_hadd_ps (msum1, msum1);
648  msum1 = _mm_hadd_ps (msum1, msum1);
649  return _mm_cvtss_f32 (msum1);
650 }
651 
652 
653 
654 
655 /***************************************************************************
656  * Matrix/vector ops
657  ***************************************************************************/
658 
659 
660 
661 /* Compute the inner product between a vector x and
662  a set of ny vectors y.
663  These functions are not intended to replace BLAS matrix-matrix, as they
664  would be significantly less efficient in this case. */
665 void fvec_inner_products_ny (float * __restrict ip,
666  const float * x,
667  const float * y,
668  size_t d, size_t ny)
669 {
670  for (size_t i = 0; i < ny; i++) {
671  ip[i] = fvec_inner_product (x, y, d);
672  y += d;
673  }
674 }
675 
676 
677 
678 
679 /* compute ny L2 distances between x and a set of vectors y */
680 void fvec_L2sqr_ny (float * __restrict dis,
681  const float * x,
682  const float * y,
683  size_t d, size_t ny)
684 {
685  for (size_t i = 0; i < ny; i++) {
686  dis[i] = fvec_L2sqr (x, y, d);
687  y += d;
688  }
689 }
690 
691 
692 
693 
694 /* Compute the L2 norm of a set of nx vectors */
695 void fvec_norms_L2 (float * __restrict nr,
696  const float * __restrict x,
697  size_t d, size_t nx)
698 {
699 
700 #pragma omp parallel for
701  for (size_t i = 0; i < nx; i++) {
702  nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
703  }
704 }
705 
706 void fvec_norms_L2sqr (float * __restrict nr,
707  const float * __restrict x,
708  size_t d, size_t nx)
709 {
710 #pragma omp parallel for
711  for (size_t i = 0; i < nx; i++)
712  nr[i] = fvec_norm_L2sqr (x + i * d, d);
713 }
714 
715 
716 
717 void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
718 {
719 #pragma omp parallel for
720  for (size_t i = 0; i < nx; i++) {
721  float * __restrict xi = x + i * d;
722 
723  float nr = fvec_norm_L2sqr (xi, d);
724 
725  if (nr > 0) {
726  size_t j;
727  const float inv_nr = 1.0 / sqrtf (nr);
728  for (j = 0; j < d; j++)
729  xi[j] *= inv_nr;
730  }
731  }
732 }
733 
734 
735 
736 
737 
738 
739 
740 
741 
742 
743 
744 
745 
746 
747 
748 
749 
750 /***************************************************************************
751  * KNN functions
752  ***************************************************************************/
753 
754 
755 
756 /* Find the nearest neighbors for nx queries in a set of ny vectors */
757 static void knn_inner_product_sse (const float * x,
758  const float * y,
759  size_t d, size_t nx, size_t ny,
760  float_minheap_array_t * res)
761 {
762  size_t k = res->k;
763 
764 #pragma omp parallel for
765  for (size_t i = 0; i < nx; i++) {
766  const float * x_ = x + i * d;
767  const float * y_ = y;
768 
769  float * __restrict simi = res->get_val(i);
770  long * __restrict idxi = res->get_ids (i);
771 
772  minheap_heapify (k, simi, idxi);
773 
774  for (size_t j = 0; j < ny; j++) {
775  float ip = fvec_inner_product (x_, y_, d);
776 
777  if (ip > simi[0]) {
778  minheap_pop (k, simi, idxi);
779  minheap_push (k, simi, idxi, ip, j);
780  }
781  y_ += d;
782  }
783  minheap_reorder (k, simi, idxi);
784  }
785 
786 }
787 
788 static void knn_L2sqr_sse (
789  const float * x,
790  const float * y,
791  size_t d, size_t nx, size_t ny,
792  float_maxheap_array_t * res)
793 {
794  size_t k = res->k;
795 
796 #pragma omp parallel for
797  for (size_t i = 0; i < nx; i++) {
798  const float * x_ = x + i * d;
799  const float * y_ = y;
800  size_t j;
801  float * __restrict simi = res->get_val(i);
802  long * __restrict idxi = res->get_ids (i);
803 
804  maxheap_heapify (k, simi, idxi);
805  for (j = 0; j < ny; j++) {
806  float disij = fvec_L2sqr (x_, y_, d);
807 
808  if (disij < simi[0]) {
809  maxheap_pop (k, simi, idxi);
810  maxheap_push (k, simi, idxi, disij, j);
811  }
812  y_ += d;
813  }
814  maxheap_reorder (k, simi, idxi);
815  }
816 
817 }
818 
819 
820 /** Find the nearest neighbors for nx queries in a set of ny vectors */
821 static void knn_inner_product_blas (
822  const float * x,
823  const float * y,
824  size_t d, size_t nx, size_t ny,
825  float_minheap_array_t * res)
826 {
827  res->heapify ();
828 
829  // BLAS does not like empty matrices
830  if (nx == 0 || ny == 0) return;
831 
832  /* block sizes */
833  const size_t bs_x = 4096, bs_y = 1024;
834  // const size_t bs_x = 16, bs_y = 16;
835  float *ip_block = new float[bs_x * bs_y];
836 
837  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
838  size_t i1 = i0 + bs_x;
839  if(i1 > nx) i1 = nx;
840 
841  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
842  size_t j1 = j0 + bs_y;
843  if (j1 > ny) j1 = ny;
844  /* compute the actual dot products */
845  {
846  float one = 1, zero = 0;
847  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
848  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
849  y + j0 * d, &di,
850  x + i0 * d, &di, &zero,
851  ip_block, &nyi);
852  }
853 
854  /* collect maxima */
855  res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
856  }
857  }
858  delete [] ip_block;
859  res->reorder ();
860 }
861 
862 // distance correction is an operator that can be applied to transform
863 // the distances
864 template<class DistanceCorrection>
865 static void knn_L2sqr_blas (const float * x,
866  const float * y,
867  size_t d, size_t nx, size_t ny,
868  float_maxheap_array_t * res,
869  const DistanceCorrection &corr)
870 {
871  res->heapify ();
872 
873  // BLAS does not like empty matrices
874  if (nx == 0 || ny == 0) return;
875 
876  size_t k = res->k;
877 
878  /* block sizes */
879  const size_t bs_x = 4096, bs_y = 1024;
880  // const size_t bs_x = 16, bs_y = 16;
881  float *ip_block = new float[bs_x * bs_y];
882 
883  float *x_norms = new float[nx];
884  fvec_norms_L2sqr (x_norms, x, d, nx);
885 
886  float *y_norms = new float[ny];
887  fvec_norms_L2sqr (y_norms, y, d, ny);
888 
889  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
890  size_t i1 = i0 + bs_x;
891  if(i1 > nx) i1 = nx;
892 
893  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
894  size_t j1 = j0 + bs_y;
895  if (j1 > ny) j1 = ny;
896  /* compute the actual dot products */
897  {
898  float one = 1, zero = 0;
899  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
900  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
901  y + j0 * d, &di,
902  x + i0 * d, &di, &zero,
903  ip_block, &nyi);
904  }
905 
906  /* collect minima */
907 #pragma omp parallel for
908  for (size_t i = i0; i < i1; i++) {
909  float * __restrict simi = res->get_val(i);
910  long * __restrict idxi = res->get_ids (i);
911  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
912 
913  for (size_t j = j0; j < j1; j++) {
914  float ip = *ip_line++;
915  float dis = x_norms[i] + y_norms[j] - 2 * ip;
916 
917  dis = corr (dis, i, j);
918 
919  if (dis < simi[0]) {
920  maxheap_pop (k, simi, idxi);
921  maxheap_push (k, simi, idxi, dis, j);
922  }
923  }
924  }
925  }
926  }
927  res->reorder ();
928 
929  delete [] ip_block;
930  delete [] x_norms;
931  delete [] y_norms;
932 }
933 
934 
935 
936 
937 
938 
939 
940 
941 
942 /*******************************************************
943  * KNN driver functions
944  *******************************************************/
945 
946 int distance_compute_blas_threshold = 20;
947 
948 void knn_inner_product (const float * x,
949  const float * y,
950  size_t d, size_t nx, size_t ny,
951  float_minheap_array_t * res)
952 {
953  if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
954  knn_inner_product_sse (x, y, d, nx, ny, res);
955  } else {
956  knn_inner_product_blas (x, y, d, nx, ny, res);
957  }
958 }
959 
960 
961 
963  float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
964  return dis;
965  }
966 };
967 
968 void knn_L2sqr (const float * x,
969  const float * y,
970  size_t d, size_t nx, size_t ny,
971  float_maxheap_array_t * res)
972 {
973  if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
974  knn_L2sqr_sse (x, y, d, nx, ny, res);
975  } else {
977  knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
978  }
979 }
980 
982  const float *base_shift;
983  float operator()(float dis, size_t /*qno*/, size_t bno) const {
984  return dis - base_shift[bno];
985  }
986 };
987 
989  const float * x,
990  const float * y,
991  size_t d, size_t nx, size_t ny,
992  float_maxheap_array_t * res,
993  const float *base_shift)
994 {
995  BaseShiftDistanceCorrection corr = {base_shift};
996  knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
997 }
998 
999 
1000 
1001 /***************************************************************************
1002  * compute a subset of distances
1003  ***************************************************************************/
1004 
1005 /* compute the inner product between x and a subset y of ny vectors,
1006  whose indices are given by idy. */
1007 void fvec_inner_products_by_idx (float * __restrict ip,
1008  const float * x,
1009  const float * y,
1010  const long * __restrict ids, /* for y vecs */
1011  size_t d, size_t nx, size_t ny)
1012 {
1013 #pragma omp parallel for
1014  for (size_t j = 0; j < nx; j++) {
1015  const long * __restrict idsj = ids + j * ny;
1016  const float * xj = x + j * d;
1017  float * __restrict ipj = ip + j * ny;
1018  for (size_t i = 0; i < ny; i++) {
1019  if (idsj[i] < 0)
1020  continue;
1021  ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
1022  }
1023  }
1024 }
1025 
1026 /* compute the inner product between x and a subset y of ny vectors,
1027  whose indices are given by idy. */
1028 void fvec_L2sqr_by_idx (float * __restrict dis,
1029  const float * x,
1030  const float * y,
1031  const long * __restrict ids, /* ids of y vecs */
1032  size_t d, size_t nx, size_t ny)
1033 {
1034 #pragma omp parallel for
1035  for (size_t j = 0; j < nx; j++) {
1036  const long * __restrict idsj = ids + j * ny;
1037  const float * xj = x + j * d;
1038  float * __restrict disj = dis + j * ny;
1039  for (size_t i = 0; i < ny; i++) {
1040  if (idsj[i] < 0)
1041  continue;
1042  disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
1043  }
1044  }
1045 }
1046 
1047 
1048 
1049 
1050 
1051 /* Find the nearest neighbors for nx queries in a set of ny vectors
1052  indexed by ids. May be useful for re-ranking a pre-selected vector list */
1053 void knn_inner_products_by_idx (const float * x,
1054  const float * y,
1055  const long * ids,
1056  size_t d, size_t nx, size_t ny,
1057  float_minheap_array_t * res)
1058 {
1059  size_t k = res->k;
1060 
1061 #pragma omp parallel for
1062  for (size_t i = 0; i < nx; i++) {
1063  const float * x_ = x + i * d;
1064  const long * idsi = ids + i * ny;
1065  size_t j;
1066  float * __restrict simi = res->get_val(i);
1067  long * __restrict idxi = res->get_ids (i);
1068  minheap_heapify (k, simi, idxi);
1069 
1070  for (j = 0; j < ny; j++) {
1071  if (idsi[j] < 0) break;
1072  float ip = fvec_inner_product (x_, y + d * idsi[j], d);
1073 
1074  if (ip > simi[0]) {
1075  minheap_pop (k, simi, idxi);
1076  minheap_push (k, simi, idxi, ip, idsi[j]);
1077  }
1078  }
1079  minheap_reorder (k, simi, idxi);
1080  }
1081 
1082 }
1083 
1084 void knn_L2sqr_by_idx (const float * x,
1085  const float * y,
1086  const long * __restrict ids,
1087  size_t d, size_t nx, size_t ny,
1088  float_maxheap_array_t * res)
1089 {
1090  size_t k = res->k;
1091 
1092 #pragma omp parallel for
1093  for (size_t i = 0; i < nx; i++) {
1094  const float * x_ = x + i * d;
1095  const long * __restrict idsi = ids + i * ny;
1096  float * __restrict simi = res->get_val(i);
1097  long * __restrict idxi = res->get_ids (i);
1098  maxheap_heapify (res->k, simi, idxi);
1099  for (size_t j = 0; j < ny; j++) {
1100  float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
1101 
1102  if (disij < simi[0]) {
1103  maxheap_pop (k, simi, idxi);
1104  maxheap_push (k, simi, idxi, disij, idsi[j]);
1105  }
1106  }
1107  maxheap_reorder (res->k, simi, idxi);
1108  }
1109 
1110 }
1111 
1112 
1113 
1114 
1115 
1116 /***************************************************************************
1117  * Range search
1118  ***************************************************************************/
1119 
1120 /** Find the nearest neighbors for nx queries in a set of ny vectors
1121  * compute_l2 = compute pairwise squared L2 distance rather than inner prod
1122  */
1123 template <bool compute_l2>
1124 static void range_search_blas (
1125  const float * x,
1126  const float * y,
1127  size_t d, size_t nx, size_t ny,
1128  float radius,
1129  RangeSearchResult *result)
1130 {
1131 
1132  // BLAS does not like empty matrices
1133  if (nx == 0 || ny == 0) return;
1134 
1135  /* block sizes */
1136  const size_t bs_x = 4096, bs_y = 1024;
1137  // const size_t bs_x = 16, bs_y = 16;
1138  float *ip_block = new float[bs_x * bs_y];
1139 
1140  float *x_norms = nullptr, *y_norms = nullptr;
1141 
1142  if (compute_l2) {
1143  x_norms = new float[nx];
1144  fvec_norms_L2sqr (x_norms, x, d, nx);
1145  y_norms = new float[ny];
1146  fvec_norms_L2sqr (y_norms, y, d, ny);
1147  }
1148 
1149  std::vector <RangeSearchPartialResult *> partial_results;
1150 
1151  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
1152  size_t j1 = j0 + bs_y;
1153  if (j1 > ny) j1 = ny;
1154  RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
1155  partial_results.push_back (pres);
1156 
1157  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
1158  size_t i1 = i0 + bs_x;
1159  if(i1 > nx) i1 = nx;
1160 
1161  /* compute the actual dot products */
1162  {
1163  float one = 1, zero = 0;
1164  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1165  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
1166  y + j0 * d, &di,
1167  x + i0 * d, &di, &zero,
1168  ip_block, &nyi);
1169  }
1170 
1171 
1172  for (size_t i = i0; i < i1; i++) {
1173  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1174 
1175  RangeSearchPartialResult::QueryResult & qres =
1176  pres->new_result (i);
1177 
1178  for (size_t j = j0; j < j1; j++) {
1179  float ip = *ip_line++;
1180  if (compute_l2) {
1181  float dis = x_norms[i] + y_norms[j] - 2 * ip;
1182  if (dis < radius) {
1183  qres.add (dis, j);
1184  }
1185  } else {
1186  if (ip > radius) {
1187  qres.add (ip, j);
1188  }
1189  }
1190  }
1191  }
1192  }
1193 
1194  }
1195  delete [] ip_block;
1196  delete [] x_norms;
1197  delete [] y_norms;
1198 
1199  { // merge the partial results
1200  int npres = partial_results.size();
1201  // count
1202  for (size_t i = 0; i < nx; i++) {
1203  for (int j = 0; j < npres; j++)
1204  result->lims[i] += partial_results[j]->queries[i].nres;
1205  }
1206  result->do_allocation ();
1207  for (int j = 0; j < npres; j++) {
1208  partial_results[j]->set_result (true);
1209  delete partial_results[j];
1210  }
1211 
1212  // reset the limits
1213  for (size_t i = nx; i > 0; i--) {
1214  result->lims [i] = result->lims [i - 1];
1215  }
1216  result->lims [0] = 0;
1217  }
1218 }
1219 
1220 
1221 template <bool compute_l2>
1222 static void range_search_sse (const float * x,
1223  const float * y,
1224  size_t d, size_t nx, size_t ny,
1225  float radius,
1226  RangeSearchResult *res)
1227 {
1228  FAISS_THROW_IF_NOT (d % 4 == 0);
1229 
1230 #pragma omp parallel
1231  {
1232  RangeSearchPartialResult pres (res);
1233 
1234 #pragma omp for
1235  for (size_t i = 0; i < nx; i++) {
1236  const float * x_ = x + i * d;
1237  const float * y_ = y;
1238  size_t j;
1239 
1240  RangeSearchPartialResult::QueryResult & qres =
1241  pres.new_result (i);
1242 
1243  for (j = 0; j < ny; j++) {
1244  if (compute_l2) {
1245  float disij = fvec_L2sqr (x_, y_, d);
1246  if (disij < radius) {
1247  qres.add (disij, j);
1248  }
1249  } else {
1250  float ip = fvec_inner_product (x_, y_, d);
1251  if (ip > radius) {
1252  qres.add (ip, j);
1253  }
1254  }
1255  y_ += d;
1256  }
1257 
1258  }
1259  pres.finalize ();
1260  }
1261 }
1262 
1263 
1264 
1265 
1266 
1268  const float * x,
1269  const float * y,
1270  size_t d, size_t nx, size_t ny,
1271  float radius,
1272  RangeSearchResult *res)
1273 {
1274 
1275  if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
1276  range_search_sse<true> (x, y, d, nx, ny, radius, res);
1277  } else {
1278  range_search_blas<true> (x, y, d, nx, ny, radius, res);
1279  }
1280 }
1281 
1283  const float * x,
1284  const float * y,
1285  size_t d, size_t nx, size_t ny,
1286  float radius,
1287  RangeSearchResult *res)
1288 {
1289 
1290  if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
1291  range_search_sse<false> (x, y, d, nx, ny, radius, res);
1292  } else {
1293  range_search_blas<false> (x, y, d, nx, ny, radius, res);
1294  }
1295 }
1296 
1297 
1298 
1299 /***************************************************************************
1300  * Some matrix manipulation functions
1301  ***************************************************************************/
1302 
1303 
1304 /* This function exists because the Torch counterpart is extremly slow
1305  (not multi-threaded + unexpected overhead even in single thread).
1306  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
1307 void inner_product_to_L2sqr (float * __restrict dis,
1308  const float * nr1,
1309  const float * nr2,
1310  size_t n1, size_t n2)
1311 {
1312 
1313 #pragma omp parallel for
1314  for (size_t j = 0 ; j < n1 ; j++) {
1315  float * disj = dis + j * n2;
1316  for (size_t i = 0 ; i < n2 ; i++)
1317  disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1318  }
1319 }
1320 
1321 
1322 void matrix_qr (int m, int n, float *a)
1323 {
1324  FAISS_THROW_IF_NOT (m >= n);
1325  FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1326  std::vector<float> tau (ki);
1327  FINTEGER lwork = -1, info;
1328  float work_size;
1329 
1330  sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1331  &work_size, &lwork, &info);
1332  lwork = size_t(work_size);
1333  std::vector<float> work (lwork);
1334 
1335  sgeqrf_ (&mi, &ni, a, &mi,
1336  tau.data(), work.data(), &lwork, &info);
1337 
1338  sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1339  work.data(), &lwork, &info);
1340 
1341 }
1342 
1343 
1344 void pairwise_L2sqr (long d,
1345  long nq, const float *xq,
1346  long nb, const float *xb,
1347  float *dis,
1348  long ldq, long ldb, long ldd)
1349 {
1350  if (nq == 0 || nb == 0) return;
1351  if (ldq == -1) ldq = d;
1352  if (ldb == -1) ldb = d;
1353  if (ldd == -1) ldd = nb;
1354 
1355  // store in beginning of distance matrix to avoid malloc
1356  float *b_norms = dis;
1357 
1358 #pragma omp parallel for
1359  for (long i = 0; i < nb; i++)
1360  b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
1361 
1362 #pragma omp parallel for
1363  for (long i = 1; i < nq; i++) {
1364  float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
1365  for (long j = 0; j < nb; j++)
1366  dis[i * ldd + j] = q_norm + b_norms [j];
1367  }
1368 
1369  {
1370  float q_norm = fvec_norm_L2sqr (xq, d);
1371  for (long j = 0; j < nb; j++)
1372  dis[j] += q_norm;
1373  }
1374 
1375  {
1376  FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1377  float one = 1.0, minus_2 = -2.0;
1378 
1379  sgemm_ ("Transposed", "Not transposed",
1380  &nbi, &nqi, &di,
1381  &minus_2,
1382  xb, &ldbi,
1383  xq, &ldqi,
1384  &one, dis, &lddi);
1385  }
1386 
1387 
1388 }
1389 
1390 
1391 
1392 /***************************************************************************
1393  * Kmeans subroutine
1394  ***************************************************************************/
1395 
1396 // a bit above machine epsilon for float16
1397 
1398 #define EPS (1 / 1024.)
1399 
1400 /* For k-means, compute centroids given assignment of vectors to centroids */
1401 int km_update_centroids (const float * x,
1402  float * centroids,
1403  long * assign,
1404  size_t d, size_t k, size_t n,
1405  size_t k_frozen)
1406 {
1407  k -= k_frozen;
1408  centroids += k_frozen * d;
1409 
1410  std::vector<size_t> hassign(k);
1411  memset (centroids, 0, sizeof(*centroids) * d * k);
1412 
1413 #pragma omp parallel
1414  {
1415  int nt = omp_get_num_threads();
1416  int rank = omp_get_thread_num();
1417  // this thread is taking care of centroids c0:c1
1418  size_t c0 = (k * rank) / nt;
1419  size_t c1 = (k * (rank + 1)) / nt;
1420  const float *xi = x;
1421  size_t nacc = 0;
1422 
1423  for (size_t i = 0; i < n; i++) {
1424  long ci = assign[i];
1425  assert (ci >= 0 && ci < k + k_frozen);
1426  ci -= k_frozen;
1427  if (ci >= c0 && ci < c1) {
1428  float * c = centroids + ci * d;
1429  hassign[ci]++;
1430  for (size_t j = 0; j < d; j++)
1431  c[j] += xi[j];
1432  nacc++;
1433  }
1434  xi += d;
1435  }
1436 
1437  }
1438 
1439 #pragma omp parallel for
1440  for (size_t ci = 0; ci < k; ci++) {
1441  float * c = centroids + ci * d;
1442  float ni = (float) hassign[ci];
1443  if (ni != 0) {
1444  for (size_t j = 0; j < d; j++)
1445  c[j] /= ni;
1446  }
1447  }
1448 
1449  /* Take care of void clusters */
1450  size_t nsplit = 0;
1451  RandomGenerator rng (1234);
1452  for (size_t ci = 0; ci < k; ci++) {
1453  if (hassign[ci] == 0) { /* need to redefine a centroid */
1454  size_t cj;
1455  for (cj = 0; 1; cj = (cj + 1) % k) {
1456  /* probability to pick this cluster for split */
1457  float p = (hassign[cj] - 1.0) / (float) (n - k);
1458  float r = rng.rand_float ();
1459  if (r < p) {
1460  break; /* found our cluster to be split */
1461  }
1462  }
1463  memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
1464 
1465  /* small symmetric pertubation. Much better than */
1466  for (size_t j = 0; j < d; j++) {
1467  if (j % 2 == 0) {
1468  centroids[ci * d + j] *= 1 + EPS;
1469  centroids[cj * d + j] *= 1 - EPS;
1470  } else {
1471  centroids[ci * d + j] *= 1 - EPS;
1472  centroids[cj * d + j] *= 1 + EPS;
1473  }
1474  }
1475 
1476  /* assume even split of the cluster */
1477  hassign[ci] = hassign[cj] / 2;
1478  hassign[cj] -= hassign[ci];
1479  nsplit++;
1480  }
1481  }
1482 
1483  return nsplit;
1484 }
1485 
1486 #undef EPS
1487 
1488 
1489 
1490 /***************************************************************************
1491  * Result list routines
1492  ***************************************************************************/
1493 
1494 
1495 void ranklist_handle_ties (int k, long *idx, const float *dis)
1496 {
1497  float prev_dis = -1e38;
1498  int prev_i = -1;
1499  for (int i = 0; i < k; i++) {
1500  if (dis[i] != prev_dis) {
1501  if (i > prev_i + 1) {
1502  // sort between prev_i and i - 1
1503  std::sort (idx + prev_i, idx + i);
1504  }
1505  prev_i = i;
1506  prev_dis = dis[i];
1507  }
1508  }
1509 }
1510 
1511 size_t merge_result_table_with (size_t n, size_t k,
1512  long *I0, float *D0,
1513  const long *I1, const float *D1,
1514  bool keep_min,
1515  long translation)
1516 {
1517  size_t n1 = 0;
1518 
1519 #pragma omp parallel reduction(+:n1)
1520  {
1521  std::vector<long> tmpI (k);
1522  std::vector<float> tmpD (k);
1523 
1524 #pragma omp for
1525  for (size_t i = 0; i < n; i++) {
1526  long *lI0 = I0 + i * k;
1527  float *lD0 = D0 + i * k;
1528  const long *lI1 = I1 + i * k;
1529  const float *lD1 = D1 + i * k;
1530  size_t r0 = 0;
1531  size_t r1 = 0;
1532 
1533  if (keep_min) {
1534  for (size_t j = 0; j < k; j++) {
1535 
1536  if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
1537  tmpD[j] = lD0[r0];
1538  tmpI[j] = lI0[r0];
1539  r0++;
1540  } else if (lD1[r1] >= 0) {
1541  tmpD[j] = lD1[r1];
1542  tmpI[j] = lI1[r1] + translation;
1543  r1++;
1544  } else { // both are NaNs
1545  tmpD[j] = NAN;
1546  tmpI[j] = -1;
1547  }
1548  }
1549  } else {
1550  for (size_t j = 0; j < k; j++) {
1551  if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
1552  tmpD[j] = lD0[r0];
1553  tmpI[j] = lI0[r0];
1554  r0++;
1555  } else if (lD1[r1] >= 0) {
1556  tmpD[j] = lD1[r1];
1557  tmpI[j] = lI1[r1] + translation;
1558  r1++;
1559  } else { // both are NaNs
1560  tmpD[j] = NAN;
1561  tmpI[j] = -1;
1562  }
1563  }
1564  }
1565  n1 += r1;
1566  memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
1567  memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
1568  }
1569  }
1570 
1571  return n1;
1572 }
1573 
1574 
1575 
1576 size_t ranklist_intersection_size (size_t k1, const long *v1,
1577  size_t k2, const long *v2_in)
1578 {
1579  if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
1580  long *v2 = new long [k2];
1581  memcpy (v2, v2_in, sizeof (long) * k2);
1582  std::sort (v2, v2 + k2);
1583  { // de-dup v2
1584  long prev = -1;
1585  size_t wp = 0;
1586  for (size_t i = 0; i < k2; i++) {
1587  if (v2 [i] != prev) {
1588  v2[wp++] = prev = v2 [i];
1589  }
1590  }
1591  k2 = wp;
1592  }
1593  const long seen_flag = 1L << 60;
1594  size_t count = 0;
1595  for (size_t i = 0; i < k1; i++) {
1596  long q = v1 [i];
1597  size_t i0 = 0, i1 = k2;
1598  while (i0 + 1 < i1) {
1599  size_t imed = (i1 + i0) / 2;
1600  long piv = v2 [imed] & ~seen_flag;
1601  if (piv <= q) i0 = imed;
1602  else i1 = imed;
1603  }
1604  if (v2 [i0] == q) {
1605  count++;
1606  v2 [i0] |= seen_flag;
1607  }
1608  }
1609  delete [] v2;
1610 
1611  return count;
1612 }
1613 
1614 double imbalance_factor (int k, const int *hist) {
1615  double tot = 0, uf = 0;
1616 
1617  for (int i = 0 ; i < k ; i++) {
1618  tot += hist[i];
1619  uf += hist[i] * (double) hist[i];
1620  }
1621  uf = uf * k / (tot * tot);
1622 
1623  return uf;
1624 }
1625 
1626 
1627 double imbalance_factor (int n, int k, const long *assign) {
1628  std::vector<int> hist(k, 0);
1629  for (int i = 0; i < n; i++) {
1630  hist[assign[i]]++;
1631  }
1632 
1633  return imbalance_factor (k, hist.data());
1634 }
1635 
1636 
1637 
1638 int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
1639  memset (hist, 0, sizeof(hist[0]) * vmax);
1640  int nout = 0;
1641  while (n--) {
1642  if (v[n] < 0 || v[n] >= vmax) nout++;
1643  else hist[v[n]]++;
1644  }
1645  return nout;
1646 }
1647 
1648 
1649 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
1650 {
1651  FAISS_THROW_IF_NOT (nbits % 8 == 0);
1652  size_t d = nbits / 8;
1653  std::vector<int> accu(d * 256);
1654  const uint8_t *c = codes;
1655  for (size_t i = 0; i < n; i++)
1656  for(int j = 0; j < d; j++)
1657  accu[j * 256 + *c++]++;
1658  memset (hist, 0, sizeof(*hist) * nbits);
1659  for (int i = 0; i < d; i++) {
1660  const int *ai = accu.data() + i * 256;
1661  int * hi = hist + i * 8;
1662  for (int j = 0; j < 256; j++)
1663  for (int k = 0; k < 8; k++)
1664  if ((j >> k) & 1)
1665  hi[k] += ai[j];
1666  }
1667 
1668 }
1669 
1670 
1671 
1672 size_t ivec_checksum (size_t n, const int *a)
1673 {
1674  size_t cs = 112909;
1675  while (n--) cs = cs * 65713 + a[n] * 1686049;
1676  return cs;
1677 }
1678 
1679 
1680 namespace {
1681  struct ArgsortComparator {
1682  const float *vals;
1683  bool operator() (const size_t a, const size_t b) const {
1684  return vals[a] < vals[b];
1685  }
1686  };
1687 
1688  struct SegmentS {
1689  size_t i0; // begin pointer in the permutation array
1690  size_t i1; // end
1691  size_t len() const {
1692  return i1 - i0;
1693  }
1694  };
1695 
1696  // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
1697  // extended to > 1 merge thread
1698 
1699  // merges 2 ranges that should be consecutive on the source into
1700  // the union of the two on the destination
1701  template<typename T>
1702  void parallel_merge (const T *src, T *dst,
1703  SegmentS &s1, SegmentS & s2, int nt,
1704  const ArgsortComparator & comp) {
1705  if (s2.len() > s1.len()) { // make sure that s1 larger than s2
1706  std::swap(s1, s2);
1707  }
1708 
1709  // compute sub-ranges for each thread
1710  SegmentS s1s[nt], s2s[nt], sws[nt];
1711  s2s[0].i0 = s2.i0;
1712  s2s[nt - 1].i1 = s2.i1;
1713 
1714  // not sure parallel actually helps here
1715 #pragma omp parallel for num_threads(nt)
1716  for (int t = 0; t < nt; t++) {
1717  s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1718  s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1719 
1720  if (t + 1 < nt) {
1721  T pivot = src[s1s[t].i1];
1722  size_t i0 = s2.i0, i1 = s2.i1;
1723  while (i0 + 1 < i1) {
1724  size_t imed = (i1 + i0) / 2;
1725  if (comp (pivot, src[imed])) {i1 = imed; }
1726  else {i0 = imed; }
1727  }
1728  s2s[t].i1 = s2s[t + 1].i0 = i1;
1729  }
1730  }
1731  s1.i0 = std::min(s1.i0, s2.i0);
1732  s1.i1 = std::max(s1.i1, s2.i1);
1733  s2 = s1;
1734  sws[0].i0 = s1.i0;
1735  for (int t = 0; t < nt; t++) {
1736  sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1737  if (t + 1 < nt) {
1738  sws[t + 1].i0 = sws[t].i1;
1739  }
1740  }
1741  assert(sws[nt - 1].i1 == s1.i1);
1742 
1743  // do the actual merging
1744 #pragma omp parallel for num_threads(nt)
1745  for (int t = 0; t < nt; t++) {
1746  SegmentS sw = sws[t];
1747  SegmentS s1t = s1s[t];
1748  SegmentS s2t = s2s[t];
1749  if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1750  for (;;) {
1751  // assert (sw.len() == s1t.len() + s2t.len());
1752  if (comp(src[s1t.i0], src[s2t.i0])) {
1753  dst[sw.i0++] = src[s1t.i0++];
1754  if (s1t.i0 == s1t.i1) break;
1755  } else {
1756  dst[sw.i0++] = src[s2t.i0++];
1757  if (s2t.i0 == s2t.i1) break;
1758  }
1759  }
1760  }
1761  if (s1t.len() > 0) {
1762  assert(s1t.len() == sw.len());
1763  memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
1764  } else if (s2t.len() > 0) {
1765  assert(s2t.len() == sw.len());
1766  memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
1767  }
1768  }
1769  }
1770 
1771 };
1772 
1773 void fvec_argsort (size_t n, const float *vals,
1774  size_t *perm)
1775 {
1776  for (size_t i = 0; i < n; i++) perm[i] = i;
1777  ArgsortComparator comp = {vals};
1778  std::sort (perm, perm + n, comp);
1779 }
1780 
1781 void fvec_argsort_parallel (size_t n, const float *vals,
1782  size_t *perm)
1783 {
1784  size_t * perm2 = new size_t[n];
1785  // 2 result tables, during merging, flip between them
1786  size_t *permB = perm2, *permA = perm;
1787 
1788  int nt = omp_get_max_threads();
1789  { // prepare correct permutation so that the result ends in perm
1790  // at final iteration
1791  int nseg = nt;
1792  while (nseg > 1) {
1793  nseg = (nseg + 1) / 2;
1794  std::swap (permA, permB);
1795  }
1796  }
1797 
1798 #pragma omp parallel
1799  for (size_t i = 0; i < n; i++) permA[i] = i;
1800 
1801  ArgsortComparator comp = {vals};
1802 
1803  SegmentS segs[nt];
1804 
1805  // independent sorts
1806 #pragma omp parallel for
1807  for (int t = 0; t < nt; t++) {
1808  size_t i0 = t * n / nt;
1809  size_t i1 = (t + 1) * n / nt;
1810  SegmentS seg = {i0, i1};
1811  std::sort (permA + seg.i0, permA + seg.i1, comp);
1812  segs[t] = seg;
1813  }
1814  int prev_nested = omp_get_nested();
1815  omp_set_nested(1);
1816 
1817  int nseg = nt;
1818  while (nseg > 1) {
1819  int nseg1 = (nseg + 1) / 2;
1820  int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1821  int sub_nseg1 = nseg / 2;
1822 
1823 #pragma omp parallel for num_threads(nseg1)
1824  for (int s = 0; s < nseg; s += 2) {
1825  if (s + 1 == nseg) { // otherwise isolated segment
1826  memcpy(permB + segs[s].i0, permA + segs[s].i0,
1827  segs[s].len() * sizeof(size_t));
1828  } else {
1829  int t0 = s * sub_nt / sub_nseg1;
1830  int t1 = (s + 1) * sub_nt / sub_nseg1;
1831  printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1832  parallel_merge(permA, permB, segs[s], segs[s + 1],
1833  t1 - t0, comp);
1834  }
1835  }
1836  for (int s = 0; s < nseg; s += 2)
1837  segs[s / 2] = segs[s];
1838  nseg = nseg1;
1839  std::swap (permA, permB);
1840  }
1841  assert (permA == perm);
1842  omp_set_nested(prev_nested);
1843  delete [] perm2;
1844 }
1845 
1846 
1847 
1848 
1849 
1850 
1851 
1852 
1853 
1854 
1855 
1856 
1857 
1858 
1859 
1860 
1861 /***************************************************************************
1862  * heavily optimized table computations
1863  ***************************************************************************/
1864 
1865 
1866 static inline void fvec_madd_ref (size_t n, const float *a,
1867  float bf, const float *b, float *c) {
1868  for (size_t i = 0; i < n; i++)
1869  c[i] = a[i] + bf * b[i];
1870 }
1871 
1872 
1873 static inline void fvec_madd_sse (size_t n, const float *a,
1874  float bf, const float *b, float *c) {
1875  n >>= 2;
1876  __m128 bf4 = _mm_set_ps1 (bf);
1877  __m128 * a4 = (__m128*)a;
1878  __m128 * b4 = (__m128*)b;
1879  __m128 * c4 = (__m128*)c;
1880 
1881  while (n--) {
1882  *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1883  b4++;
1884  a4++;
1885  c4++;
1886  }
1887 }
1888 
1889 void fvec_madd (size_t n, const float *a,
1890  float bf, const float *b, float *c)
1891 {
1892  if ((n & 3) == 0 &&
1893  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1894  fvec_madd_sse (n, a, bf, b, c);
1895  else
1896  fvec_madd_ref (n, a, bf, b, c);
1897 }
1898 
1899 static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
1900  float bf, const float *b, float *c) {
1901  float vmin = 1e20;
1902  int imin = -1;
1903 
1904  for (size_t i = 0; i < n; i++) {
1905  c[i] = a[i] + bf * b[i];
1906  if (c[i] < vmin) {
1907  vmin = c[i];
1908  imin = i;
1909  }
1910  }
1911  return imin;
1912 }
1913 
1914 static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
1915  float bf, const float *b, float *c) {
1916  n >>= 2;
1917  __m128 bf4 = _mm_set_ps1 (bf);
1918  __m128 vmin4 = _mm_set_ps1 (1e20);
1919  __m128i imin4 = _mm_set1_epi32 (-1);
1920  __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1921  __m128i inc4 = _mm_set1_epi32 (4);
1922  __m128 * a4 = (__m128*)a;
1923  __m128 * b4 = (__m128*)b;
1924  __m128 * c4 = (__m128*)c;
1925 
1926  while (n--) {
1927  __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1928  *c4 = vc4;
1929  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1930  // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
1931 
1932  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1933  _mm_andnot_si128 (mask, imin4));
1934  vmin4 = _mm_min_ps (vmin4, vc4);
1935  b4++;
1936  a4++;
1937  c4++;
1938  idx4 = _mm_add_epi32 (idx4, inc4);
1939  }
1940 
1941  // 4 values -> 2
1942  {
1943  idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1944  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1945  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1946  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1947  _mm_andnot_si128 (mask, imin4));
1948  vmin4 = _mm_min_ps (vmin4, vc4);
1949  }
1950  // 2 values -> 1
1951  {
1952  idx4 = _mm_shuffle_epi32 (imin4, 1);
1953  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1954  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1955  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1956  _mm_andnot_si128 (mask, imin4));
1957  // vmin4 = _mm_min_ps (vmin4, vc4);
1958  }
1959  return _mm_extract_epi32 (imin4, 0);
1960 }
1961 
1962 
1963 int fvec_madd_and_argmin (size_t n, const float *a,
1964  float bf, const float *b, float *c)
1965 {
1966  if ((n & 3) == 0 &&
1967  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1968  return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1969  else
1970  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
1971 }
1972 
1973 
1974 
1976  size_t d, size_t *n, size_t nmax, const float *x,
1977  bool verbose, long seed)
1978 {
1979 
1980  if (*n <= nmax) return x; // nothing to do
1981 
1982  size_t n2 = nmax;
1983  if (verbose) {
1984  printf (" Input training set too big (max size is %ld), sampling "
1985  "%ld / %ld vectors\n", nmax, n2, *n);
1986  }
1987  std::vector<int> subset (*n);
1988  rand_perm (subset.data (), *n, seed);
1989  float *x_subset = new float[n2 * d];
1990  for (long i = 0; i < n2; i++)
1991  memcpy (&x_subset[i * d],
1992  &x[subset[i] * size_t(d)],
1993  sizeof (x[0]) * d);
1994  *n = n2;
1995  return x_subset;
1996 }
1997 
1998 
1999 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:48
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
Definition: utils.cpp:1401
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:988
RandomGenerator(long seed=1234)
initialize
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:574
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1649
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1975
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1495
float rand_float()
between 0 and 1
Definition: utils.cpp:211
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1889
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1638
long rand_long()
random long &lt; 2 ^ 62
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
Definition: utils.cpp:1511
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1576
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1344
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1282
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:948
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:74
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1614
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:632
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1267
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1322
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1672
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1963
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:968