/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved
// -*- c++ -*-

#include "utils.h"

#include <cstdio>
#include <cassert>
#include <cstring>
#include <cmath>

#include <immintrin.h>

#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

#include <omp.h>

#include <algorithm>
#include <vector>

#include "AuxIndexStructures.h"
#include "FaissAssert.h"


#ifndef FINTEGER
#define FINTEGER long
#endif


extern "C" {

/* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */

int sgemm_ (const char *transa, const char *transb, FINTEGER *m,
            FINTEGER *n, FINTEGER *k, const float *alpha, const float *a,
            FINTEGER *lda, const float *b, FINTEGER *ldb,
            float *beta, float *c, FINTEGER *ldc);

/* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */

int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
             float *tau, float *work, FINTEGER *lwork, FINTEGER *info);

int sorgqr_ (FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
             FINTEGER *lda, float *tau, float *work,
             FINTEGER *lwork, FINTEGER *info);

}


/**************************************************
 * Get some stats about the system
 **************************************************/

namespace faiss {

double getmillisecs () {
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
}


#ifdef __linux__

size_t get_mem_usage_kb ()
{
    int pid = getpid ();
    char fname[256];
    snprintf (fname, 256, "/proc/%d/status", pid);
    FILE * f = fopen (fname, "r");
    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
    size_t sz = 0;
    for (;;) {
        char buf[256];
        if (!fgets (buf, 256, f)) break;
        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
    }
    fclose (f);
    return sz;
}

#elif __APPLE__

size_t get_mem_usage_kb ()
{
    fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
    return 0;
}

#endif

/**************************************************
 * Random data generation functions
 **************************************************/

/**
 * The definition of random functions depends on the architecture:
 *
 * - for Linux, we rely on re-entrant functions (random_r). This
 *   provides good quality reproducible random sequences.
 *
 * - for Apple, we use rand_r. Apple is trying so hard to deprecate
 *   this function that it removed its definition from stdlib.h, so we
 *   re-declare it below. Fortunately, since it is deprecated, its
 *   prototype should not change much in the foreseeable future.
 *
 * Unfortunately, system designers are more concerned with making the
 * most unpredictable random sequences for cryptographic use, when in
 * scientific contexts what actually matters is having reproducible
 * sequences in multi-threaded contexts.
 */


#ifdef __linux__

int RandomGenerator::rand_int ()
{
    int32_t a;
    random_r (&rand_data, &a);
    return a;
}

long RandomGenerator::rand_long ()
{
    int32_t a, b;
    random_r (&rand_data, &a);
    random_r (&rand_data, &b);
    return long(a) | long(b) << 31;
}

RandomGenerator::RandomGenerator (long seed)
{
    memset (&rand_data, 0, sizeof (rand_data));
    initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
}

RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
    memcpy (rand_state, other.rand_state, sizeof(rand_state));
    rand_data = other.rand_data;
    setstate_r (rand_state, &rand_data);
}


#elif __APPLE__

extern "C" {
int rand_r(unsigned *seed);
}

RandomGenerator::RandomGenerator (long seed)
{
    rand_state = seed;
}

RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
    rand_state = other.rand_state;
}

int RandomGenerator::rand_int ()
{
    // RAND_MAX is 31 bits
    // try to add more randomness in the lower bits
    int lowbits = rand_r(&rand_state) >> 15;
    return rand_r(&rand_state) ^ lowbits;
}

long RandomGenerator::rand_long ()
{
    return long(random()) | long(random()) << 31;
}

#endif

int RandomGenerator::rand_int (int max)
{   // this suffers from non-uniform probabilities when max is not a
    // power of 2, but if RAND_MAX >> max the bias is limited.
    return rand_int () % max;
}

float RandomGenerator::rand_float ()
{
    return rand_int() / float(1L << 31);
}

double RandomGenerator::rand_double ()
{
    return rand_long() / double(1L << 62);
}


/***********************************************************************
 * Random functions in this C file only exist because Torch
 * counterparts are slow and not multi-threaded. Typical use is for
 * 1-100 billion values. */


/* Generate a set of random floating point values such that x[i] in [0,1].
   The generation is done in independent blocks so that it can use
   multi-threading. For this reason, we rely on re-entrant functions. */
void float_rand (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_float ();
    }
}
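
/* Illustrative sketch (not part of the library): because every one of the
   nblock blocks is seeded deterministically from a0 + j * b0, the output of
   float_rand depends only on the seed, not on the number of OpenMP threads.
   A hypothetical sanity check of that property: */
#if 0
void check_float_rand_reproducible ()
{
    std::vector<float> x1 (1 << 20), x2 (1 << 20);
    omp_set_num_threads (1);
    float_rand (x1.data(), x1.size(), 1234);
    omp_set_num_threads (8);
    float_rand (x2.data(), x2.size(), 1234);
    // same seed => identical output regardless of thread count
    assert (memcmp (x1.data(), x2.data(),
                    x1.size() * sizeof(float)) == 0);
}
#endif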


void float_randn (float * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        double a = 0, b = 0, s = 0;
        int state = 0;  /* generate two numbers per "do-while" loop */

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++) {
            /* Marsaglia's method (see Knuth) */
            if (state == 0) {
                do {
                    a = 2.0 * rng.rand_double () - 1;
                    b = 2.0 * rng.rand_double () - 1;
                    s = a * a + b * b;
                } while (s >= 1.0);
                x[i] = a * sqrt(-2.0 * log(s) / s);
            }
            else
                x[i] = b * sqrt(-2.0 * log(s) / s);
            state = 1 - state;
        }
    }
}


/* Integer versions */
void long_rand (long * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;
        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}


void rand_perm (int *perm, size_t n, long seed)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;

    RandomGenerator rng (seed);

    for (size_t i = 0; i + 1 < n; i++) {
        int i2 = i + rng.rand_int (n - i);
        std::swap(perm[i], perm[i2]);
    }
}


void byte_rand (uint8_t * x, size_t n, long seed)
{
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {

        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        size_t i;
        for (i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}


void reflection (const float * __restrict u,
                 float * __restrict x,
                 size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            float ip1 = 0, ip2 = 0;

            for (j = 0; j < d; j += 2) {
                ip1 += up[j] * x[j];
                ip2 += up[j+1] * x[j+1];
            }
            float ip = 2 * (ip1 + ip2);

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];
            up += d;
        }
        x += d;
    }
}


/* Reference implementation (slower) */
void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            double ip = 0;

            for (j = 0; j < d; j++)
                ip += up[j] * x[j];
            ip *= 2;

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];

            up += d;
        }
        x += d;
    }
}

/*********************************************************
 * Optimized distance computations
 *********************************************************/


/* Functions to compute:
   - L2 distance between 2 vectors
   - inner product between 2 vectors
   - L2 norm of a vector

   The functions should probably not be invoked when a large number of
   vectors are processed in batch (in which case matrix multiplication
   is faster), but may be useful for comparing isolated vectors in
   memory.

   Works with any vectors of any dimension, even unaligned (in which
   case they are slower).
*/


/*********************************************************
 * Reference implementations
 */


/* same computation as the SSE version below, without SSE */
float fvec_L2sqr_ref (const float * x,
                      const float * y,
                      size_t d)
{
    size_t i;
    float res_ = 0;
    for (i = 0; i < d; i++) {
        const float tmp = x[i] - y[i];
        res_ += tmp * tmp;
    }
    return res_;
}

float fvec_inner_product_ref (const float * x,
                              const float * y,
                              size_t d)
{
    size_t i;
    float res_ = 0;
    for (i = 0; i < d; i++)
        res_ += x[i] * y[i];
    return res_;
}

float fvec_norm_L2sqr_ref (const float * __restrict x,
                           size_t d)
{
    size_t i;
    double res_ = 0;
    for (i = 0; i < d; i++)
        res_ += x[i] * x[i];
    return res_;
}


/*********************************************************
 * SSE implementations
 */

// reads 0 <= d < 4 floats as __m128
static inline __m128 masked_read (int d, const float *x)
{
    assert (0 <= d && d < 4);
    __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
    switch (d) {
      case 3:
        buf[2] = x[2];
      case 2:
        buf[1] = x[1];
      case 1:
        buf[0] = x[0];
    }
    return _mm_load_ps (buf);
    // cannot use AVX2 _mm_mask_set1_epi32
}


/* SSE-implementation of L2 distance */
float fvec_L2sqr (const float * x,
                  const float * y,
                  size_t d)
{
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        __m128 mx = _mm_loadu_ps (x); x += 4;
        __m128 my = _mm_loadu_ps (y); y += 4;
        const __m128 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
        d -= 4;
    }

    if (d > 0) {
        // add the last 1, 2 or 3 values
        __m128 mx = masked_read (d, x);
        __m128 my = masked_read (d, y);
        __m128 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
    }

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}
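
/* Illustrative sketch (not part of the library): the SSE and reference
   versions compute the same quantity up to floating-point summation order,
   so a quick sanity check can compare them on random data: */
#if 0
void check_L2sqr_consistency ()
{
    size_t d = 123;                      // deliberately not a multiple of 4
    std::vector<float> x (d), y (d);
    float_rand (x.data(), d, 1);
    float_rand (y.data(), d, 2);
    float a = fvec_L2sqr (x.data(), y.data(), d);
    float b = fvec_L2sqr_ref (x.data(), y.data(), d);
    assert (fabs (a - b) < 1e-4 * b);    // tolerate summation-order effects
}
#endif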


float fvec_inner_product (const float * x,
                          const float * y,
                          size_t d)
{
    __m128 mx, my;
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        mx = _mm_loadu_ps (x); x += 4;
        my = _mm_loadu_ps (y); y += 4;
        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
        d -= 4;
    }

    // add the last 1, 2, or 3 values
    mx = masked_read (d, x);
    my = masked_read (d, y);
    __m128 prod = _mm_mul_ps (mx, my);

    msum1 = _mm_add_ps (msum1, prod);

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}


float fvec_norm_L2sqr (const float * x,
                       size_t d)
{
    __m128 mx;
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        mx = _mm_loadu_ps (x); x += 4;
        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
        d -= 4;
    }

    mx = masked_read (d, x);
    msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}


/*********************************************************
 * AVX implementations
 *
 * Disabled for now, it is not faster than SSE on current machines
 * see P57425519
 */

#if 0

// reads 0 <= d < 8 floats as __m256
static inline __m256 masked_read_8 (int d, const float *x)
{
    assert (0 <= d && d < 8);
    if (d < 4) {
        __m256 res = _mm256_setzero_ps ();
        res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
        return res;
    } else {
        __m256 res;
        res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
        res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
        return res;
    }
}


float fvec_L2sqr (const float * x,
                  const float * y,
                  size_t d)
{
    __m256 msum1 = _mm256_setzero_ps();

    while (d >= 8) {
        __m256 mx = _mm256_loadu_ps (x); x += 8;
        __m256 my = _mm256_loadu_ps (y); y += 8;
        const __m256 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
        d -= 8;
    }

    if (d > 0) {
        // add the last 1 to 7 values
        __m256 mx = masked_read_8 (d, x);
        __m256 my = masked_read_8 (d, y);
        __m256 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
    }

    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
    msum2 += _mm256_extractf128_ps(msum1, 0);

    msum2 = _mm_hadd_ps (msum2, msum2);
    msum2 = _mm_hadd_ps (msum2, msum2);
    return _mm_cvtss_f32 (msum2);
}

#endif


/***************************************************************************
 * Matrix/vector ops
 ***************************************************************************/


/* Compute the inner product between a vector x and
   a set of ny vectors y.
   These functions are not intended to replace BLAS matrix-matrix
   multiplication, which is significantly more efficient for that case. */
void fvec_inner_products_ny (float * __restrict ip,
                             const float * x,
                             const float * y,
                             size_t d, size_t ny)
{
    for (size_t i = 0; i < ny; i++) {
        ip[i] = fvec_inner_product (x, y, d);
        y += d;
    }
}


/* compute ny L2 distances between x and a set of vectors y */
void fvec_L2sqr_ny (float * __restrict dis,
                    const float * x,
                    const float * y,
                    size_t d, size_t ny)
{
    for (size_t i = 0; i < ny; i++) {
        dis[i] = fvec_L2sqr (x, y, d);
        y += d;
    }
}


/* Compute the L2 norm of a set of nx vectors */
void fvec_norms_L2 (float * __restrict nr,
                    const float * __restrict x,
                    size_t d, size_t nx)
{

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
    }
}

void fvec_norms_L2sqr (float * __restrict nr,
                       const float * __restrict x,
                       size_t d, size_t nx)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++)
        nr[i] = fvec_norm_L2sqr (x + i * d, d);
}


void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        float * __restrict xi = x + i * d;

        float nr = fvec_norm_L2sqr (xi, d);

        if (nr > 0) {
            size_t j;
            const float inv_nr = 1.0 / sqrtf (nr);
            for (j = 0; j < d; j++)
                xi[j] *= inv_nr;
        }
    }
}

/***************************************************************************
 * KNN functions
 ***************************************************************************/


/* Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_sse (const float * x,
                                   const float * y,
                                   size_t d, size_t nx, size_t ny,
                                   float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const float * y_ = y;

        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        minheap_heapify (k, simi, idxi);

        for (size_t j = 0; j < ny; j++) {
            float ip = fvec_inner_product (x_, y_, d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, j);
            }
            y_ += d;
        }
        minheap_reorder (k, simi, idxi);
    }

}

static void knn_L2sqr_sse (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const float * y_ = y;
        size_t j;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        maxheap_heapify (k, simi, idxi);
        for (j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y_, d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, j);
            }
            y_ += d;
        }
        maxheap_reorder (k, simi, idxi);
    }

}


/** Find the nearest neighbors for nx queries in a set of ny vectors */
static void knn_inner_product_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_minheap_array_t * res)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if(i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect maxima */
            res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
        }
    }
    delete [] ip_block;
    res->reorder ();
}

// distance correction is an operator that can be applied to transform
// the distances
template<class DistanceCorrection>
static void knn_L2sqr_blas (const float * x,
                            const float * y,
                            size_t d, size_t nx, size_t ny,
                            float_maxheap_array_t * res,
                            const DistanceCorrection &corr)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    size_t k = res->k;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = new float[nx];
    fvec_norms_L2sqr (x_norms, x, d, nx);

    float *y_norms = new float[ny];
    fvec_norms_L2sqr (y_norms, y, d, ny);

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if(i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect minima */
#pragma omp parallel for
            for (size_t i = i0; i < i1; i++) {
                float * __restrict simi = res->get_val(i);
                long * __restrict idxi = res->get_ids (i);
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    float dis = x_norms[i] + y_norms[j] - 2 * ip;

                    dis = corr (dis, i, j);

                    if (dis < simi[0]) {
                        maxheap_pop (k, simi, idxi);
                        maxheap_push (k, simi, idxi, dis, j);
                    }
                }
            }
        }
    }
    res->reorder ();

    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;
}
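
/* Caveat: the BLAS path computes squared L2 distances through the expansion
   |x-y|^2 = |x|^2 + |y|^2 - 2<x,y>. For near-duplicate vectors the result can
   come out slightly negative because of floating-point cancellation; the
   DistanceCorrection hook is a natural place to clamp or otherwise
   post-process distances when exact non-negativity matters. */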


/*******************************************************
 * KNN driver functions
 *******************************************************/

void knn_inner_product (const float * x,
                        const float * y,
                        size_t d, size_t nx, size_t ny,
                        float_minheap_array_t * res)
{
    if (d % 4 == 0 && nx < 20) {
        knn_inner_product_sse (x, y, d, nx, ny, res);
    } else {
        knn_inner_product_blas (x, y, d, nx, ny, res);
    }
}


struct NopDistanceCorrection {
    float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
        return dis;
    }
};

void knn_L2sqr (const float * x,
                const float * y,
                size_t d, size_t nx, size_t ny,
                float_maxheap_array_t * res)
{
    if (d % 4 == 0 && nx < 20) {
        knn_L2sqr_sse (x, y, d, nx, ny, res);
    } else {
        NopDistanceCorrection nop;
        knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
    }
}
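
/* Illustrative usage sketch (not part of the library): brute-force k-NN with
   the driver above. The HeapArray fields are, in order: number of heaps,
   heap size k, ids array and values array. */
#if 0
void knn_example (const float *xq, const float *xb,
                  size_t d, size_t nq, size_t nb, size_t k)
{
    std::vector<float> distances (nq * k);
    std::vector<long>  labels (nq * k);
    float_maxheap_array_t res = { nq, k, labels.data(), distances.data() };
    knn_L2sqr (xq, xb, d, nq, nb, &res);
    // distances / labels now hold, per query, the k smallest squared L2
    // distances and the matching database ids, sorted by increasing distance.
}
#endif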

struct BaseShiftDistanceCorrection {
    const float *base_shift;
    float operator()(float dis, size_t /*qno*/, size_t bno) const {
        return dis - base_shift[bno];
    }
};

void knn_L2sqr_base_shift (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res,
        const float *base_shift)
{
    BaseShiftDistanceCorrection corr = {base_shift};
    knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
}


/***************************************************************************
 * compute a subset of distances
 ***************************************************************************/

/* compute the inner product between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_inner_products_by_idx (float * __restrict ip,
                                 const float * x,
                                 const float * y,
                                 const long * __restrict ids, /* for y vecs */
                                 size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict ipj = ip + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
        }
    }
}

/* compute the squared L2 distances between x and a subset y of ny vectors,
   whose indices are given by ids. */
void fvec_L2sqr_by_idx (float * __restrict dis,
                        const float * x,
                        const float * y,
                        const long * __restrict ids, /* ids of y vecs */
                        size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict disj = dis + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
        }
    }
}


/* Find the nearest neighbors for nx queries in a set of ny vectors
   indexed by ids. May be useful for re-ranking a pre-selected vector list */
void knn_inner_products_by_idx (const float * x,
                                const float * y,
                                const long * ids,
                                size_t d, size_t nx, size_t ny,
                                float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * idsi = ids + i * ny;
        size_t j;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        minheap_heapify (k, simi, idxi);

        for (j = 0; j < ny; j++) {
            if (idsi[j] < 0) break;
            float ip = fvec_inner_product (x_, y + d * idsi[j], d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, idsi[j]);
            }
        }
        minheap_reorder (k, simi, idxi);
    }

}

void knn_L2sqr_by_idx (const float * x,
                       const float * y,
                       const long * __restrict ids,
                       size_t d, size_t nx, size_t ny,
                       float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * __restrict idsi = ids + i * ny;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        maxheap_heapify (res->k, simi, idxi);
        for (size_t j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, idsi[j]);
            }
        }
        maxheap_reorder (res->k, simi, idxi);
    }

}


/***************************************************************************
 * Range search
 ***************************************************************************/

/** Find the nearest neighbors for nx queries in a set of ny vectors
 * compute_l2 = compute pairwise squared L2 distance rather than inner prod
 */
template <bool compute_l2>
static void range_search_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *result)
{

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    // const size_t bs_x = 16, bs_y = 16;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = nullptr, *y_norms = nullptr;

    if (compute_l2) {
        x_norms = new float[nx];
        fvec_norms_L2sqr (x_norms, x, d, nx);
        y_norms = new float[ny];
        fvec_norms_L2sqr (y_norms, y, d, ny);
    }

    std::vector <RangeSearchPartialResult *> partial_results;

    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
        size_t j1 = j0 + bs_y;
        if (j1 > ny) j1 = ny;
        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
        partial_results.push_back (pres);

        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
            size_t i1 = i0 + bs_x;
            if(i1 > nx) i1 = nx;

            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            for (size_t i = i0; i < i1; i++) {
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                RangeSearchPartialResult::QueryResult & qres =
                    pres->new_result (i);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    if (compute_l2) {
                        float dis = x_norms[i] + y_norms[j] - 2 * ip;
                        if (dis < radius) {
                            qres.add (dis, j);
                        }
                    } else {
                        if (ip > radius) {
                            qres.add (ip, j);
                        }
                    }
                }
            }
        }

    }
    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;

    { // merge the partial results
        int npres = partial_results.size();
        // count
        for (size_t i = 0; i < nx; i++) {
            for (int j = 0; j < npres; j++)
                result->lims[i] += partial_results[j]->queries[i].nres;
        }
        result->do_allocation ();
        for (int j = 0; j < npres; j++) {
            partial_results[j]->set_result (true);
            delete partial_results[j];
        }

        // reset the limits
        for (size_t i = nx; i > 0; i--) {
            result->lims [i] = result->lims [i - 1];
        }
        result->lims [0] = 0;
    }
}


template <bool compute_l2>
static void range_search_sse (const float * x,
                              const float * y,
                              size_t d, size_t nx, size_t ny,
                              float radius,
                              RangeSearchResult *res)
{
    FAISS_THROW_IF_NOT (d % 4 == 0);

#pragma omp parallel
    {
        RangeSearchPartialResult pres (res);

#pragma omp for
        for (size_t i = 0; i < nx; i++) {
            const float * x_ = x + i * d;
            const float * y_ = y;
            size_t j;

            RangeSearchPartialResult::QueryResult & qres =
                pres.new_result (i);

            for (j = 0; j < ny; j++) {
                if (compute_l2) {
                    float disij = fvec_L2sqr (x_, y_, d);
                    if (disij < radius) {
                        qres.add (disij, j);
                    }
                } else {
                    float ip = fvec_inner_product (x_, y_, d);
                    if (ip > radius) {
                        qres.add (ip, j);
                    }
                }
                y_ += d;
            }

        }
        pres.finalize ();
    }
}


void range_search_L2sqr (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < 20) {
        range_search_sse<true> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<true> (x, y, d, nx, ny, radius, res);
    }
}

void range_search_inner_product (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{

    if (d % 4 == 0 && nx < 20) {
        range_search_sse<false> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<false> (x, y, d, nx, ny, radius, res);
    }
}
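
/* Illustrative usage sketch (not part of the library): collect all database
   vectors within squared radius r of each query. The result layout is
   CSR-like: lims has nq+1 entries, and the matches of query i live in
   labels[lims[i]..lims[i+1]) / distances[lims[i]..lims[i+1]). */
#if 0
void range_search_example (const float *xq, const float *xb,
                           size_t d, size_t nq, size_t nb, float r)
{
    RangeSearchResult result (nq);
    range_search_L2sqr (xq, xb, d, nq, nb, r, &result);
    for (size_t i = 0; i < nq; i++) {
        for (size_t j = result.lims[i]; j < result.lims[i + 1]; j++) {
            printf ("query %zu: db vector %ld at distance %g\n",
                    i, result.labels[j], result.distances[j]);
        }
    }
}
#endif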


/***************************************************************************
 * Some matrix manipulation functions
 ***************************************************************************/


/* This function exists because the Torch counterpart is extremely slow
   (not multi-threaded + unexpected overhead even in single thread).
   It is here to implement the usual property |x-y|^2 = |x|^2 + |y|^2 - 2<x|y> */
void inner_product_to_L2sqr (float * __restrict dis,
                             const float * nr1,
                             const float * nr2,
                             size_t n1, size_t n2)
{

#pragma omp parallel for
    for (size_t j = 0 ; j < n1 ; j++) {
        float * disj = dis + j * n2;
        for (size_t i = 0 ; i < n2 ; i++)
            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
    }
}


void matrix_qr (int m, int n, float *a)
{
    FAISS_THROW_IF_NOT (m >= n);
    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
    std::vector<float> tau (ki);
    FINTEGER lwork = -1, info;
    float work_size;

    // workspace query: lwork = -1 makes sgeqrf return the optimal size
    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
             &work_size, &lwork, &info);
    lwork = size_t(work_size);
    std::vector<float> work (lwork);

    sgeqrf_ (&mi, &ni, a, &mi,
             tau.data(), work.data(), &lwork, &info);

    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
             work.data(), &lwork, &info);

}
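
/* Illustrative usage sketch (not part of the library): matrix_qr overwrites
   a with the Q factor of its QR decomposition (LAPACK is column-major, so a
   row-major caller sees Q transposed, which is still orthonormal). A random
   Gaussian matrix followed by matrix_qr therefore gives a random orthonormal
   basis, e.g. for random rotations: */
#if 0
void make_random_orthonormal (int d, float *q, long seed)
{
    float_randn (q, (size_t)d * d, seed);  // i.i.d. N(0,1) entries
    matrix_qr (d, d, q);                   // q is now orthonormal
}
#endif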


void pairwise_L2sqr (long d,
                     long nq, const float *xq,
                     long nb, const float *xb,
                     float *dis,
                     long ldq, long ldb, long ldd)
{
    if (nq == 0 || nb == 0) return;
    if (ldq == -1) ldq = d;
    if (ldb == -1) ldb = d;
    if (ldd == -1) ldd = nb;

    // store in beginning of distance matrix to avoid malloc
    float *b_norms = dis;

#pragma omp parallel for
    for (long i = 0; i < nb; i++)
        b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);

#pragma omp parallel for
    for (long i = 1; i < nq; i++) {
        float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
        for (long j = 0; j < nb; j++)
            dis[i * ldd + j] = q_norm + b_norms [j];
    }

    {
        float q_norm = fvec_norm_L2sqr (xq, d);
        for (long j = 0; j < nb; j++)
            dis[j] += q_norm;
    }

    {
        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
        float one = 1.0, minus_2 = -2.0;

        sgemm_ ("Transposed", "Not transposed",
                &nbi, &nqi, &di,
                &minus_2,
                xb, &ldbi,
                xq, &ldqi,
                &one, dis, &lddi);
    }

}
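
/* Illustrative note: with the default strides (pass -1 for ldq, ldb and ldd),
   xq and xb are plain row-major nq x d and nb x d matrices and dis is
   nq x nb, e.g.:

       pairwise_L2sqr (d, nq, xq, nb, xb, dis, -1, -1, -1);
*/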


/***************************************************************************
 * Kmeans subroutine
 ***************************************************************************/

// a bit above machine epsilon for float16

#define EPS (1 / 1024.)

/* For k-means, compute centroids given assignment of vectors to centroids */
/* NOTE: This could be multi-threaded (use histogram of indexes) */
int km_update_centroids (const float * x,
                         float * centroids,
                         long * assign,
                         size_t d, size_t k, size_t n)
{
    std::vector<size_t> hassign(k);
    memset (centroids, 0, sizeof(*centroids) * d * k);


#pragma omp parallel
    {
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();
        // this thread is taking care of centroids c0:c1
        size_t c0 = (k * rank) / nt;
        size_t c1 = (k * (rank + 1)) / nt;
        const float *xi = x;
        // printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
        size_t nacc = 0;

        for (size_t i = 0; i < n; i++) {
            long ci = assign[i];
            assert (ci >= 0 && ci < k);
            if (ci >= c0 && ci < c1)  {
                float * c = centroids + ci * d;
                hassign[ci]++;
                for (size_t j = 0; j < d; j++)
                    c[j] += xi[j];
                nacc++;
            }
            xi += d;
        }
        // printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);

    }

#pragma omp parallel for
    for (size_t ci = 0; ci < k; ci++) {
        float * c = centroids + ci * d;
        float ni = (float) hassign[ci];
        if (ni != 0) {
            for (size_t j = 0; j < d; j++)
                c[j] /= ni;
        }
    }

    /* Take care of void clusters */
    size_t nsplit = 0;
    RandomGenerator rng (1234);
    for (size_t ci = 0; ci < k; ci++) {
        if (hassign[ci] == 0) { /* need to redefine a centroid */
            size_t cj;
            for (cj = 0; 1; cj = (cj + 1) % k) {
                /* probability to pick this cluster for split */
                float p = (hassign[cj] - 1.0) / (float) (n - k);
                float r = rng.rand_float ();
                if (r < p) {
                    break; /* found our cluster to be split */
                }
            }
            memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);

            /* small symmetric perturbation */
            for (size_t j = 0; j < d; j++) {
                if (j % 2 == 0) {
                    centroids[ci * d + j] *= 1 + EPS;
                    centroids[cj * d + j] *= 1 - EPS;
                } else {
                    centroids[ci * d + j] *= 1 - EPS;
                    centroids[cj * d + j] *= 1 + EPS;
                }
            }

            /* assume even split of the cluster */
            hassign[ci] = hassign[cj] / 2;
            hassign[cj] -= hassign[ci];
            nsplit++;
        }
    }

    return nsplit;
}

#undef EPS


/***************************************************************************
 * Result list routines
 ***************************************************************************/


void ranklist_handle_ties (int k, long *idx, const float *dis)
{
    float prev_dis = -1e38;
    int prev_i = -1;
    for (int i = 0; i < k; i++) {
        if (dis[i] != prev_dis) {
            if (i > prev_i + 1) {
                // sort between prev_i and i - 1
                std::sort (idx + prev_i, idx + i);
            }
            prev_i = i;
            prev_dis = dis[i];
        }
    }
}

size_t merge_result_table_with (size_t n, size_t k,
                                long *I0, float *D0,
                                const long *I1, const float *D1,
                                bool keep_min,
                                long translation)
{
    size_t n1 = 0;

#pragma omp parallel reduction(+:n1)
    {
        std::vector<long> tmpI (k);
        std::vector<float> tmpD (k);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long *lI0 = I0 + i * k;
            float *lD0 = D0 + i * k;
            const long *lI1 = I1 + i * k;
            const float *lD1 = D1 + i * k;
            size_t r0 = 0;
            size_t r1 = 0;

            if (keep_min) {
                for (size_t j = 0; j < k; j++) {

                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            } else {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both are NaNs
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            }
            n1 += r1;
            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
        }
    }

    return n1;
}


size_t ranklist_intersection_size (size_t k1, const long *v1,
                                   size_t k2, const long *v2_in)
{
    if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
    long *v2 = new long [k2];
    memcpy (v2, v2_in, sizeof (long) * k2);
    std::sort (v2, v2 + k2);
    { // de-dup v2
        long prev = -1;
        size_t wp = 0;
        for (size_t i = 0; i < k2; i++) {
            if (v2 [i] != prev) {
                v2[wp++] = prev = v2 [i];
            }
        }
        k2 = wp;
    }
    const long seen_flag = 1L << 60;
    size_t count = 0;
    for (size_t i = 0; i < k1; i++) {
        long q = v1 [i];
        size_t i0 = 0, i1 = k2;
        while (i0 + 1 < i1) {
            size_t imed = (i1 + i0) / 2;
            long piv = v2 [imed] & ~seen_flag;
            if (piv <= q) i0 = imed;
            else          i1 = imed;
        }
        if (v2 [i0] == q) {
            count++;
            v2 [i0] |= seen_flag;
        }
    }
    delete [] v2;

    return count;
}

double imbalance_factor (int k, const int *hist) {
    double tot = 0, uf = 0;

    for (int i = 0 ; i < k ; i++) {
        tot += hist[i];
        uf += hist[i] * (double) hist[i];
    }
    uf = uf * k / (tot * tot);

    return uf;
}


double imbalance_factor (int n, int k, const long *assign) {
    std::vector<int> hist(k, 0);
    for (int i = 0; i < n; i++) {
        hist[assign[i]]++;
    }

    return imbalance_factor (k, hist.data());
}
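
/* Worked example: the imbalance factor is k * sum_i h_i^2 / (sum_i h_i)^2.
   A perfectly balanced assignment (all h_i equal) gives 1.0; putting every
   point in a single cluster gives k. E.g. with k = 2 and hist = {3, 1}:
   2 * (9 + 1) / 16 = 1.25. */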


int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
    memset (hist, 0, sizeof(hist[0]) * vmax);
    int nout = 0;
    while (n--) {
        if (v[n] < 0 || v[n] >= vmax) nout++;
        else hist[v[n]]++;
    }
    return nout;
}


void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
{
    FAISS_THROW_IF_NOT (nbits % 8 == 0);
    size_t d = nbits / 8;
    std::vector<int> accu(d * 256);
    const uint8_t *c = codes;
    for (size_t i = 0; i < n; i++)
        for(int j = 0; j < d; j++)
            accu[j * 256 + *c++]++;
    memset (hist, 0, sizeof(*hist) * nbits);
    for (int i = 0; i < d; i++) {
        const int *ai = accu.data() + i * 256;
        int * hi = hist + i * 8;
        for (int j = 0; j < 256; j++)
            for (int k = 0; k < 8; k++)
                if ((j >> k) & 1)
                    hi[k] += ai[j];
    }

}


size_t ivec_checksum (size_t n, const int *a)
{
    size_t cs = 112909;
    while (n--) cs = cs * 65713 + a[n] * 1686049;
    return cs;
}


namespace {
    struct ArgsortComparator {
        const float *vals;
        bool operator() (const size_t a, const size_t b) const {
            return vals[a] < vals[b];
        }
    };

    struct SegmentS {
        size_t i0; // begin pointer in the permutation array
        size_t i1; // end
        size_t len() const {
            return i1 - i0;
        }
    };

    // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
    // extended to > 1 merge thread

    // merges 2 ranges that should be consecutive on the source into
    // the union of the two on the destination
    template<typename T>
    void parallel_merge (const T *src, T *dst,
                         SegmentS &s1, SegmentS & s2, int nt,
                         const ArgsortComparator & comp) {
        if (s2.len() > s1.len()) { // make sure that s1 larger than s2
            std::swap(s1, s2);
        }

        // compute sub-ranges for each thread
        SegmentS s1s[nt], s2s[nt], sws[nt];
        s2s[0].i0 = s2.i0;
        s2s[nt - 1].i1 = s2.i1;

        // not sure parallel actually helps here
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            s1s[t].i0 = s1.i0 + s1.len() * t / nt;
            s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;

            if (t + 1 < nt) {
                T pivot = src[s1s[t].i1];
                size_t i0 = s2.i0, i1 = s2.i1;
                while (i0 + 1 < i1) {
                    size_t imed = (i1 + i0) / 2;
                    if (comp (pivot, src[imed])) {i1 = imed; }
                    else                         {i0 = imed; }
                }
                s2s[t].i1 = s2s[t + 1].i0 = i1;
            }
        }
        s1.i0 = std::min(s1.i0, s2.i0);
        s1.i1 = std::max(s1.i1, s2.i1);
        s2 = s1;
        sws[0].i0 = s1.i0;
        for (int t = 0; t < nt; t++) {
            sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
            if (t + 1 < nt) {
                sws[t + 1].i0 = sws[t].i1;
            }
        }
        assert(sws[nt - 1].i1 == s1.i1);

        // do the actual merging
#pragma omp parallel for num_threads(nt)
        for (int t = 0; t < nt; t++) {
            SegmentS sw = sws[t];
            SegmentS s1t = s1s[t];
            SegmentS s2t = s2s[t];
            if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
                for (;;) {
                    // assert (sw.len() == s1t.len() + s2t.len());
                    if (comp(src[s1t.i0], src[s2t.i0])) {
                        dst[sw.i0++] = src[s1t.i0++];
                        if (s1t.i0 == s1t.i1) break;
                    } else {
                        dst[sw.i0++] = src[s2t.i0++];
                        if (s2t.i0 == s2t.i1) break;
                    }
                }
            }
            if (s1t.len() > 0) {
                assert(s1t.len() == sw.len());
                memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
            } else if (s2t.len() > 0) {
                assert(s2t.len() == sw.len());
                memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
            }
        }
    }

} // anonymous namespace

void fvec_argsort (size_t n, const float *vals,
                   size_t *perm)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;
    ArgsortComparator comp = {vals};
    std::sort (perm, perm + n, comp);
}

void fvec_argsort_parallel (size_t n, const float *vals,
                            size_t *perm)
{
    size_t * perm2 = new size_t[n];
    // 2 result tables, during merging, flip between them
    size_t *permB = perm2, *permA = perm;

    int nt = omp_get_max_threads();
    { // prepare correct permutation so that the result ends in perm
      // at final iteration
        int nseg = nt;
        while (nseg > 1) {
            nseg = (nseg + 1) / 2;
            std::swap (permA, permB);
        }
    }

#pragma omp parallel for
    for (size_t i = 0; i < n; i++) permA[i] = i;

    ArgsortComparator comp = {vals};

    SegmentS segs[nt];

    // independent sorts
#pragma omp parallel for
    for (int t = 0; t < nt; t++) {
        size_t i0 = t * n / nt;
        size_t i1 = (t + 1) * n / nt;
        SegmentS seg = {i0, i1};
        std::sort (permA + seg.i0, permA + seg.i1, comp);
        segs[t] = seg;
    }
    int prev_nested = omp_get_nested();
    omp_set_nested(1);

    int nseg = nt;
    while (nseg > 1) {
        int nseg1 = (nseg + 1) / 2;
        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
        int sub_nseg1 = nseg / 2;

#pragma omp parallel for num_threads(nseg1)
        for (int s = 0; s < nseg; s += 2) {
            if (s + 1 == nseg) { // otherwise isolated segment
                memcpy(permB + segs[s].i0, permA + segs[s].i0,
                       segs[s].len() * sizeof(size_t));
            } else {
                int t0 = s * sub_nt / sub_nseg1;
                int t1 = (s + 1) * sub_nt / sub_nseg1;
                // printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                parallel_merge(permA, permB, segs[s], segs[s + 1],
                               t1 - t0, comp);
            }
        }
        for (int s = 0; s < nseg; s += 2)
            segs[s / 2] = segs[s];
        nseg = nseg1;
        std::swap (permA, permB);
    }
    assert (permA == perm);
    omp_set_nested(prev_nested);
    delete [] perm2;
}


/***************************************************************************
 * heavily optimized table computations
 ***************************************************************************/


static inline void fvec_madd_ref (size_t n, const float *a,
                                  float bf, const float *b, float *c) {
    for (size_t i = 0; i < n; i++)
        c[i] = a[i] + bf * b[i];
}


static inline void fvec_madd_sse (size_t n, const float *a,
                                  float bf, const float *b, float *c) {
    n >>= 2;
    __m128 bf4 = _mm_set_ps1 (bf);
    __m128 * a4 = (__m128*)a;
    __m128 * b4 = (__m128*)b;
    __m128 * c4 = (__m128*)c;

    while (n--) {
        *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
        b4++;
        a4++;
        c4++;
    }
}

void fvec_madd (size_t n, const float *a,
                float bf, const float *b, float *c)
{
    if ((n & 3) == 0 &&
        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
        fvec_madd_sse (n, a, bf, b, c);
    else
        fvec_madd_ref (n, a, bf, b, c);
}
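
/* Note: the SSE path above is taken only when n is a multiple of 4 and a, b
   and c are all 16-byte aligned; any other input silently falls back to the
   scalar reference loop, so callers do not have to align their buffers. */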

static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
                                            float bf, const float *b, float *c) {
    float vmin = 1e20;
    int imin = -1;

    for (size_t i = 0; i < n; i++) {
        c[i] = a[i] + bf * b[i];
        if (c[i] < vmin) {
            vmin = c[i];
            imin = i;
        }
    }
    return imin;
}

static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
                                            float bf, const float *b, float *c) {
    n >>= 2;
    __m128 bf4 = _mm_set_ps1 (bf);
    __m128 vmin4 = _mm_set_ps1 (1e20);
    __m128i imin4 = _mm_set1_epi32 (-1);
    __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
    __m128i inc4 = _mm_set1_epi32 (4);
    __m128 * a4 = (__m128*)a;
    __m128 * b4 = (__m128*)b;
    __m128 * c4 = (__m128*)c;

    while (n--) {
        __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
        *c4 = vc4;
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!

        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        vmin4 = _mm_min_ps (vmin4, vc4);
        b4++;
        a4++;
        c4++;
        idx4 = _mm_add_epi32 (idx4, inc4);
    }

    // 4 values -> 2
    {
        idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        vmin4 = _mm_min_ps (vmin4, vc4);
    }
    // 2 values -> 1
    {
        idx4 = _mm_shuffle_epi32 (imin4, 1);
        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        // vmin4 = _mm_min_ps (vmin4, vc4);
    }
    return _mm_extract_epi32 (imin4, 0);
}


int fvec_madd_and_argmin (size_t n, const float *a,
                          float bf, const float *b, float *c)
{
    if ((n & 3) == 0 &&
        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
        return fvec_madd_and_argmin_sse (n, a, bf, b, c);
    else
        return fvec_madd_and_argmin_ref (n, a, bf, b, c);
}


const float *fvecs_maybe_subsample (
        size_t d, size_t *n, size_t nmax, const float *x,
        bool verbose, long seed)
{

    if (*n <= nmax) return x; // nothing to do

    size_t n2 = nmax;
    if (verbose) {
        printf ("  Input training set too big (max size is %ld), sampling "
                "%ld / %ld vectors\n", nmax, n2, *n);
    }
    std::vector<int> subset (*n);
    rand_perm (subset.data (), *n, seed);
    float *x_subset = new float[n2 * d];
    for (long i = 0; i < n2; i++)
        memcpy (&x_subset[i * d],
                &x[subset[i] * size_t(d)],
                sizeof (x[0]) * d);
    *n = n2;
    return x_subset;
}
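
/* Illustrative usage sketch (not part of the library): the returned pointer
   aliases x when no subsampling was needed, and otherwise owns freshly
   allocated memory, so the caller frees it only in the second case: */
#if 0
void train_on_subsample (size_t d, size_t nt, const float *x, size_t max_train)
{
    size_t n = nt;
    const float *x_train =
        fvecs_maybe_subsample (d, &n, max_train, x, true, 1234);
    // ... train on the n vectors in x_train ...
    if (x_train != x) delete [] x_train;
}
#endif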


} // namespace faiss