Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 // -*- c++ -*-
12 
13 #include "utils.h"
14 
15 #include <cstdio>
16 #include <cassert>
17 #include <cstring>
18 #include <cmath>
19 #include <cmath>
20 
21 #include <smmintrin.h>
22 #include <mmintrin.h>
23 
24 #include <sys/time.h>
25 #include <sys/types.h>
26 #include <unistd.h>
27 
28 #include <omp.h>
29 
30 
31 #include <algorithm>
32 #include <vector>
33 
34 #include "AuxIndexStructures.h"
35 #include "FaissAssert.h"
36 
37 
38 
39 #ifndef FINTEGER
40 #define FINTEGER long
41 #endif
42 
43 
44 extern "C" {
45 
46 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
47 
48 int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
49  n, FINTEGER *k, const float *alpha, const float *a,
50  FINTEGER *lda, const float *b, FINTEGER *
51  ldb, float *beta, float *c, FINTEGER *ldc);
52 
53 /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
54 
55 int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
56  float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
57 
58 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
59  FINTEGER *lda, float *tau, float *work,
60  FINTEGER *lwork, FINTEGER *info);
61 
62 
63 }
64 
65 
66 /**************************************************
67  * Get some stats about the system
68  **************************************************/
69 
70 namespace faiss {
71 
72 double getmillisecs () {
73  struct timeval tv;
74  gettimeofday (&tv, nullptr);
75  return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
76 }
77 
78 
79 #ifdef __linux__
80 
81 size_t get_mem_usage_kb ()
82 {
83  int pid = getpid ();
84  char fname[256];
85  snprintf (fname, 256, "/proc/%d/status", pid);
86  FILE * f = fopen (fname, "r");
87  FAISS_ASSERT (f || !"cannot open proc status file") ;
88  size_t sz = 0;
89  for (;;) {
90  char buf [256];
91  if (!fgets (buf, 256, f)) break;
92  if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
93  }
94  fclose (f);
95  return sz;
96 }
97 
98 #elif __APPLE__
99 
100 size_t get_mem_usage_kb ()
101 {
102  fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
103  return 0;
104 }
105 
106 #endif
107 
108 
109 
110 /**************************************************
111  * Random data generation functions
112  **************************************************/
113 
114 /**
115  * The definition of random functions depends on the architecture:
116  *
117  * - for Linux, we rely on re-entrant functions (random_r). This
118  * provides good quality reproducible random sequences.
119  *
120  * - for Apple, we use rand_r. Apple is trying so hard to deprecate
121  * this function that it removed its definition form stdlib.h, so we
122  * re-declare it below. Fortunately, since it is deprecated, its
123  * prototype should not change much in the forerseeable future.
124  *
125  * Unfortunately, system designers are more concerned with making the
126  * most unpredictable random sequences for cryptographic use, when in
127  * scientific contexts what acutally matters is having reproducible
128  * squences in multi-threaded contexts.
129  */
130 
131 
132 #ifdef __linux__
133 
134 
135 
136 
138 {
139  int32_t a;
140  random_r (&rand_data, &a);
141  return a;
142 }
143 
145 {
146  int32_t a, b;
147  random_r (&rand_data, &a);
148  random_r (&rand_data, &b);
149  return long(a) | long(b) << 31;
150 }
151 
152 
154 {
155  memset (&rand_data, 0, sizeof (rand_data));
156  initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
157 }
158 
159 
160 RandomGenerator::RandomGenerator (const RandomGenerator & other)
161 {
162  memcpy (rand_state, other.rand_state, sizeof(rand_state));
163  rand_data = other.rand_data;
164  setstate_r (rand_state, &rand_data);
165 }
166 
167 
168 #elif __APPLE__
169 
170 extern "C" {
171 int rand_r(unsigned *seed);
172 }
173 
175 {
176  rand_state = seed;
177 }
178 
179 
180 RandomGenerator::RandomGenerator (const RandomGenerator & other)
181 {
182  rand_state = other.rand_state;
183 }
184 
185 
187 {
188  // RAND_MAX is 31 bits
189  // try to add more randomness in the lower bits
190  int lowbits = rand_r(&rand_state) >> 15;
191  return rand_r(&rand_state) ^ lowbits;
192 }
193 
195 {
196  return long(random()) | long(random()) << 31;
197 }
198 
199 
200 
201 #endif
202 
204 { // this suffers form non-uniform probabilities when max is not a
205  // power of 2, but if RAND_MAX >> max the bias is limited.
206  return rand_int () % max;
207 }
208 
210 {
211  return rand_int() / float(1 << 31);
212 }
213 
214 double RandomGenerator::rand_double ()
215 {
216  return rand_long() / double(1L << 62);
217 }
218 
219 
220 /***********************************************************************
221  * Random functions in this C file only exist because Torch
222  * counterparts are slow and not multi-threaded. Typical use is for
223  * more than 1-100 billion values. */
224 
225 
226 /* Generate a set of random floating point values such that x[i] in [0,1]
227  multi-threading. For this reason, we rely on re-entreant functions. */
228 void float_rand (float * x, size_t n, long seed)
229 {
230  // only try to parallelize on large enough arrays
231  const size_t nblock = n < 1024 ? 1 : 1024;
232 
233  RandomGenerator rng0 (seed);
234  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
235 
236 #pragma omp parallel for
237  for (size_t j = 0; j < nblock; j++) {
238 
239  RandomGenerator rng (a0 + j * b0);
240 
241  const size_t istart = j * n / nblock;
242  const size_t iend = (j + 1) * n / nblock;
243 
244  for (size_t i = istart; i < iend; i++)
245  x[i] = rng.rand_float ();
246  }
247 }
248 
249 
250 void float_randn (float * x, size_t n, long seed)
251 {
252  // only try to parallelize on large enough arrays
253  const size_t nblock = n < 1024 ? 1 : 1024;
254 
255  RandomGenerator rng0 (seed);
256  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
257 
258 #pragma omp parallel for
259  for (size_t j = 0; j < nblock; j++) {
260  RandomGenerator rng (a0 + j * b0);
261 
262  double a = 0, b = 0, s = 0;
263  int state = 0; /* generate two number per "do-while" loop */
264 
265  const size_t istart = j * n / nblock;
266  const size_t iend = (j + 1) * n / nblock;
267 
268  for (size_t i = istart; i < iend; i++) {
269  /* Marsaglia's method (see Knuth) */
270  if (state == 0) {
271  do {
272  a = 2.0 * rng.rand_double () - 1;
273  b = 2.0 * rng.rand_double () - 1;
274  s = a * a + b * b;
275  } while (s >= 1.0);
276  x[i] = a * sqrt(-2.0 * log(s) / s);
277  }
278  else
279  x[i] = b * sqrt(-2.0 * log(s) / s);
280  state = 1 - state;
281  }
282  }
283 }
284 
285 
286 /* Integer versions */
287 void long_rand (long * x, size_t n, long seed)
288 {
289  // only try to parallelize on large enough arrays
290  const size_t nblock = n < 1024 ? 1 : 1024;
291 
292  RandomGenerator rng0 (seed);
293  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
294 
295 #pragma omp parallel for
296  for (size_t j = 0; j < nblock; j++) {
297 
298  RandomGenerator rng (a0 + j * b0);
299 
300  const size_t istart = j * n / nblock;
301  const size_t iend = (j + 1) * n / nblock;
302  for (size_t i = istart; i < iend; i++)
303  x[i] = rng.rand_long ();
304  }
305 }
306 
307 
308 
309 void rand_perm (int *perm, size_t n, long seed)
310 {
311  for (size_t i = 0; i < n; i++) perm[i] = i;
312 
313  RandomGenerator rng (seed);
314 
315  for (size_t i = 0; i + 1 < n; i++) {
316  int i2 = i + rng.rand_int (n - i);
317  std::swap(perm[i], perm[i2]);
318  }
319 }
320 
321 
322 
323 
324 void byte_rand (uint8_t * x, size_t n, long seed)
325 {
326  // only try to parallelize on large enough arrays
327  const size_t nblock = n < 1024 ? 1 : 1024;
328 
329  RandomGenerator rng0 (seed);
330  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
331 
332 #pragma omp parallel for
333  for (size_t j = 0; j < nblock; j++) {
334 
335  RandomGenerator rng (a0 + j * b0);
336 
337  const size_t istart = j * n / nblock;
338  const size_t iend = (j + 1) * n / nblock;
339 
340  size_t i;
341  for (i = istart; i < iend; i++)
342  x[i] = rng.rand_long ();
343  }
344 }
345 
346 
347 
348 void reflection (const float * __restrict u,
349  float * __restrict x,
350  size_t n, size_t d, size_t nu)
351 {
352  size_t i, j, l;
353  for (i = 0; i < n; i++) {
354  const float * up = u;
355  for (l = 0; l < nu; l++) {
356  float ip1 = 0, ip2 = 0;
357 
358  for (j = 0; j < d; j+=2) {
359  ip1 += up[j] * x[j];
360  ip2 += up[j+1] * x[j+1];
361  }
362  float ip = 2 * (ip1 + ip2);
363 
364  for (j = 0; j < d; j++)
365  x[j] -= ip * up[j];
366  up += d;
367  }
368  x += d;
369  }
370 }
371 
372 
373 /* Reference implementation (slower) */
374 void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
375 {
376  size_t i, j, l;
377  for (i = 0; i < n; i++) {
378  const float * up = u;
379  for (l = 0; l < nu; l++) {
380  double ip = 0;
381 
382  for (j = 0; j < d; j++)
383  ip += up[j] * x[j];
384  ip *= 2;
385 
386  for (j = 0; j < d; j++)
387  x[j] -= ip * up[j];
388 
389  up += d;
390  }
391  x += d;
392  }
393 }
394 
395 /*********************************************************
396  * Optimized distance computations
397  *********************************************************/
398 
399 
400 
401 /* Functions to compute:
402  - L2 distance between 2 vectors
403  - inner product between 2 vectors
404  - L2 norm of a vector
405 
406  The functions should probably not be invoked when a large number of
407  vectors are be processed in batch (in which case Matrix multiply
408  is faster), but may be useful for comparing vectors isolated in
409  memory.
410 
411  Works with any vectors of any dimension, even unaligned (in which
412  case they are slower).
413 
414 */
415 
416 
417 // set some of the floats in x to 0 so that there are d floats remaining
418 // 0 <= d <= 4
419 static inline __m128 maskout (int d, __m128 x)
420 {
421  // check which values to clear out
422  __m128i d4 = _mm_set1_epi32 (d);
423  __m128i lim = _mm_set_epi32 (3, 2, 1, 0);
424  __m128i mask = _mm_cmpgt_epi32 (d4, lim);
425 
426  return _mm_and_ps (x, (__m128)mask);
427 }
428 
429 
430 /* SSE-implementation of L2 distance */
431 float fvec_L2sqr (const float * x,
432  const float * y,
433  size_t d)
434 {
435  __m128 mx, my;
436  __m128 msum1 = _mm_setzero_ps();
437 
438  while (d > 4) {
439  mx = _mm_loadu_ps (x); x += 4;
440  my = _mm_loadu_ps (y); y += 4;
441  const __m128 a_m_b1 = _mm_sub_ps (mx, my);
442  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
443  d -= 4;
444  }
445 
446  // add the last 1, 2, 3, or 4 values
447  mx = _mm_loadu_ps (x);
448  my = _mm_loadu_ps (y);
449  __m128 a_m_b1 = maskout (d, _mm_sub_ps (mx, my));
450 
451  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
452 
453  msum1 = _mm_hadd_ps (msum1, msum1);
454  msum1 = _mm_hadd_ps (msum1, msum1);
455  return _mm_cvtss_f32 (msum1);
456 }
457 
458 
459 /* same without SSE */
460 float fvec_L2sqr_ref (const float * x,
461  const float * y,
462  size_t d)
463 {
464  size_t i;
465  float res_ = 0;
466  for (i = 0; i < d; i++) {
467  const float tmp = x[i] - y[i];
468  res_ += tmp * tmp;
469  }
470  return res_;
471 }
472 
473 float fvec_inner_product (const float * x,
474  const float * y,
475  size_t d)
476 {
477  __m128 mx, my;
478  __m128 msum1 = _mm_setzero_ps();
479 
480  while (d > 4) {
481  mx = _mm_loadu_ps (x); x += 4;
482  my = _mm_loadu_ps (y); y += 4;
483  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
484  d -= 4;
485  }
486 
487  // add the last 1, 2, 3, or 4 values
488  mx = _mm_loadu_ps (x);
489  my = _mm_loadu_ps (y);
490  __m128 prod = maskout (d, _mm_mul_ps (mx, my));
491 
492  msum1 = _mm_add_ps (msum1, prod);
493 
494  msum1 = _mm_hadd_ps (msum1, msum1);
495  msum1 = _mm_hadd_ps (msum1, msum1);
496  return _mm_cvtss_f32 (msum1);
497 }
498 
499 
500 float fvec_inner_product_ref (const float * x,
501  const float * y,
502  size_t d)
503 {
504  size_t i;
505  float res_ = 0;
506  for (i = 0; i < d; i++)
507  res_ += x[i] * y[i];
508  return res_;
509 }
510 
511 
512 float fvec_norm_L2sqr (const float * x,
513  size_t d)
514 {
515  __m128 mx;
516  __m128 msum1 = _mm_setzero_ps();
517 
518  while (d > 4) {
519  mx = _mm_loadu_ps (x); x += 4;
520  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
521  d -= 4;
522  }
523 
524  mx = maskout (d, _mm_loadu_ps (x));
525  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
526 
527  msum1 = _mm_hadd_ps (msum1, msum1);
528  msum1 = _mm_hadd_ps (msum1, msum1);
529  return _mm_cvtss_f32 (msum1);
530 }
531 
532 
533 float fvec_norm_L2sqr_ref (const float * __restrict x,
534  size_t d)
535 {
536  size_t i;
537  double res_ = 0;
538  for (i = 0; i < d; i++)
539  res_ += x[i] * x[i];
540  return res_;
541 }
542 
543 
544 
545 
546 /* Compute the inner product between a vector x and
547  a set of ny vectors y.
548  These functions are not intended to replace BLAS matrix-matrix, as they
549  would be significantly less efficient in this case. */
550 void fvec_inner_products_ny (float * __restrict ip,
551  const float * x,
552  const float * y,
553  size_t d, size_t ny)
554 {
555  for (size_t i = 0; i < ny; i++) {
556  ip[i] = fvec_inner_product (x, y, d);
557  y += d;
558  }
559 }
560 
561 
562 
563 
564 /* compute ny L2 distances between x and a set of vectors y */
565 void fvec_L2sqr_ny (float * __restrict dis,
566  const float * x,
567  const float * y,
568  size_t d, size_t ny)
569 {
570  for (size_t i = 0; i < ny; i++) {
571  dis[i] = fvec_L2sqr (x, y, d);
572  y += d;
573  }
574 }
575 
576 
577 
578 
579 /* Compute the L2 norm of a set of nx vectors */
580 void fvec_norms_L2 (float * __restrict nr,
581  const float * __restrict x,
582  size_t d, size_t nx)
583 {
584 
585 #pragma omp parallel for
586  for (size_t i = 0; i < nx; i++) {
587  nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
588  }
589 }
590 
591 void fvec_norms_L2sqr (float * __restrict nr,
592  const float * __restrict x,
593  size_t d, size_t nx)
594 {
595 #pragma omp parallel for
596  for (size_t i = 0; i < nx; i++)
597  nr[i] = fvec_norm_L2sqr (x + i * d, d);
598 }
599 
600 
601 
602 void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
603 {
604 #pragma omp parallel for
605  for (size_t i = 0; i < nx; i++) {
606  float * __restrict xi = x + i * d;
607 
608  float nr = fvec_norm_L2sqr (xi, d);
609 
610  if (nr > 0) {
611  size_t j;
612  const float inv_nr = 1.0 / sqrtf (nr);
613  for (j = 0; j < d; j++)
614  xi[j] *= inv_nr;
615  }
616  }
617 }
618 
619 
620 
621 
622 
623 
624 
625 
626 
627 
628 
629 
630 
631 
632 
633 
634 
635 /***************************************************************************
636  * KNN functions
637  ***************************************************************************/
638 
639 
640 
641 /* Find the nearest neighbors for nx queries in a set of ny vectors */
642 static void knn_inner_product_sse (const float * x,
643  const float * y,
644  size_t d, size_t nx, size_t ny,
645  float_minheap_array_t * res)
646 {
647  size_t k = res->k;
648 
649 #pragma omp parallel for
650  for (size_t i = 0; i < nx; i++) {
651  const float * x_ = x + i * d;
652  const float * y_ = y;
653 
654  float * __restrict simi = res->get_val(i);
655  long * __restrict idxi = res->get_ids (i);
656 
657  minheap_heapify (k, simi, idxi);
658 
659  for (size_t j = 0; j < ny; j++) {
660  float ip = fvec_inner_product (x_, y_, d);
661 
662  if (ip > simi[0]) {
663  minheap_pop (k, simi, idxi);
664  minheap_push (k, simi, idxi, ip, j);
665  }
666  y_ += d;
667  }
668  minheap_reorder (k, simi, idxi);
669  }
670 
671 }
672 
673 static void knn_L2sqr_sse (
674  const float * x,
675  const float * y,
676  size_t d, size_t nx, size_t ny,
677  float_maxheap_array_t * res)
678 {
679  size_t k = res->k;
680 
681 #pragma omp parallel for
682  for (size_t i = 0; i < nx; i++) {
683  const float * x_ = x + i * d;
684  const float * y_ = y;
685  size_t j;
686  float * __restrict simi = res->get_val(i);
687  long * __restrict idxi = res->get_ids (i);
688 
689  maxheap_heapify (k, simi, idxi);
690  for (j = 0; j < ny; j++) {
691  float disij = fvec_L2sqr (x_, y_, d);
692 
693  if (disij < simi[0]) {
694  maxheap_pop (k, simi, idxi);
695  maxheap_push (k, simi, idxi, disij, j);
696  }
697  y_ += d;
698  }
699  maxheap_reorder (k, simi, idxi);
700  }
701 
702 }
703 
704 
705 /** Find the nearest neighbors for nx queries in a set of ny vectors */
706 static void knn_inner_product_blas (
707  const float * x,
708  const float * y,
709  size_t d, size_t nx, size_t ny,
710  float_minheap_array_t * res)
711 {
712  res->heapify ();
713 
714  // BLAS does not like empty matrices
715  if (nx == 0 || ny == 0) return;
716 
717  /* block sizes */
718  const size_t bs_x = 4096, bs_y = 1024;
719  // const size_t bs_x = 16, bs_y = 16;
720  float *ip_block = new float[bs_x * bs_y];
721 
722  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
723  size_t i1 = i0 + bs_x;
724  if(i1 > nx) i1 = nx;
725 
726  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
727  size_t j1 = j0 + bs_y;
728  if (j1 > ny) j1 = ny;
729  /* compute the actual dot products */
730  {
731  float one = 1, zero = 0;
732  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
733  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
734  y + j0 * d, &di,
735  x + i0 * d, &di, &zero,
736  ip_block, &nyi);
737  }
738 
739  /* collect maxima */
740  res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
741  }
742  }
743  delete [] ip_block;
744  res->reorder ();
745 }
746 
747 // distance correction is an operator that can be applied to transform
748 // the distances
749 template<class DistanceCorrection>
750 static void knn_L2sqr_blas (const float * x,
751  const float * y,
752  size_t d, size_t nx, size_t ny,
753  float_maxheap_array_t * res,
754  const DistanceCorrection &corr)
755 {
756  res->heapify ();
757 
758  // BLAS does not like empty matrices
759  if (nx == 0 || ny == 0) return;
760 
761  size_t k = res->k;
762 
763  /* block sizes */
764  const size_t bs_x = 4096, bs_y = 1024;
765  // const size_t bs_x = 16, bs_y = 16;
766  float *ip_block = new float[bs_x * bs_y];
767 
768  float *x_norms = new float[nx];
769  fvec_norms_L2sqr (x_norms, x, d, nx);
770 
771  float *y_norms = new float[ny];
772  fvec_norms_L2sqr (y_norms, y, d, ny);
773 
774  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
775  size_t i1 = i0 + bs_x;
776  if(i1 > nx) i1 = nx;
777 
778  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
779  size_t j1 = j0 + bs_y;
780  if (j1 > ny) j1 = ny;
781  /* compute the actual dot products */
782  {
783  float one = 1, zero = 0;
784  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
785  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
786  y + j0 * d, &di,
787  x + i0 * d, &di, &zero,
788  ip_block, &nyi);
789  }
790 
791  /* collect minima */
792 #pragma omp parallel for
793  for (size_t i = i0; i < i1; i++) {
794  float * __restrict simi = res->get_val(i);
795  long * __restrict idxi = res->get_ids (i);
796  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
797 
798  for (size_t j = j0; j < j1; j++) {
799  float ip = *ip_line++;
800  float dis = x_norms[i] + y_norms[j] - 2 * ip;
801 
802  dis = corr (dis, i, j);
803 
804  if (dis < simi[0]) {
805  maxheap_pop (k, simi, idxi);
806  maxheap_push (k, simi, idxi, dis, j);
807  }
808  }
809  }
810  }
811  }
812  res->reorder ();
813 
814  delete [] ip_block;
815  delete [] x_norms;
816  delete [] y_norms;
817 }
818 
819 
820 
821 
822 
823 
824 
825 
826 
827 /*******************************************************
828  * KNN driver functions
829  *******************************************************/
830 
831 void knn_inner_product (const float * x,
832  const float * y,
833  size_t d, size_t nx, size_t ny,
834  float_minheap_array_t * res)
835 {
836  if (d % 4 == 0 && nx < 20) {
837  knn_inner_product_sse (x, y, d, nx, ny, res);
838  } else {
839  knn_inner_product_blas (x, y, d, nx, ny, res);
840  }
841 }
842 
843 
844 
846  float operator()(float dis, size_t qno, size_t bno) const {
847  return dis;
848  }
849 };
850 
851 void knn_L2sqr (const float * x,
852  const float * y,
853  size_t d, size_t nx, size_t ny,
854  float_maxheap_array_t * res)
855 {
856  if (d % 4 == 0 && nx < 20) {
857  knn_L2sqr_sse (x, y, d, nx, ny, res);
858  } else {
860  knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
861  }
862 }
863 
865  const float *base_shift;
866  float operator()(float dis, size_t qno, size_t bno) const {
867  return dis - base_shift[bno];
868  }
869 };
870 
872  const float * x,
873  const float * y,
874  size_t d, size_t nx, size_t ny,
875  float_maxheap_array_t * res,
876  const float *base_shift)
877 {
878  BaseShiftDistanceCorrection corr = {base_shift};
879  knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
880 }
881 
882 
883 
884 /***************************************************************************
885  * compute a subset of distances
886  ***************************************************************************/
887 
888 /* compute the inner product between x and a subset y of ny vectors,
889  whose indices are given by idy. */
890 void fvec_inner_products_by_idx (float * __restrict ip,
891  const float * x,
892  const float * y,
893  const long * __restrict ids, /* for y vecs */
894  size_t d, size_t nx, size_t ny)
895 {
896 #pragma omp parallel for
897  for (size_t j = 0; j < nx; j++) {
898  const long * __restrict idsj = ids + j * ny;
899  const float * xj = x + j * d;
900  float * __restrict ipj = ip + j * ny;
901  for (size_t i = 0; i < ny; i++) {
902  if (idsj[i] < 0)
903  continue;
904  ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
905  }
906  }
907 }
908 
909 /* compute the inner product between x and a subset y of ny vectors,
910  whose indices are given by idy. */
911 void fvec_L2sqr_by_idx (float * __restrict dis,
912  const float * x,
913  const float * y,
914  const long * __restrict ids, /* ids of y vecs */
915  size_t d, size_t nx, size_t ny)
916 {
917 #pragma omp parallel for
918  for (size_t j = 0; j < nx; j++) {
919  const long * __restrict idsj = ids + j * ny;
920  const float * xj = x + j * d;
921  float * __restrict disj = dis + j * ny;
922  for (size_t i = 0; i < ny; i++) {
923  if (idsj[i] < 0)
924  continue;
925  disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
926  }
927  }
928 }
929 
930 
931 
932 
933 
934 /* Find the nearest neighbors for nx queries in a set of ny vectors
935  indexed by ids. May be useful for re-ranking a pre-selected vector list */
936 void knn_inner_products_by_idx (const float * x,
937  const float * y,
938  const long * __restrict ids,
939  size_t d, size_t nx, size_t ny,
940  float_minheap_array_t * res)
941 {
942  size_t k = res->k;
943 
944 
945 
946 #pragma omp parallel for
947  for (size_t i = 0; i < nx; i++) {
948  const float * x_ = x + i * d;
949  const long * idsi = ids + i * ny;
950  size_t j;
951  float * __restrict simi = res->get_val(i);
952  long * __restrict idxi = res->get_ids (i);
953  minheap_heapify (k, simi, idxi);
954 
955  for (j = 0; j < ny; j++) {
956  if (idsi[j] < 0) break;
957  float ip = fvec_inner_product (x_, y + d * idsi[j], d);
958 
959  if (ip > simi[0]) {
960  minheap_pop (k, simi, idxi);
961  minheap_push (k, simi, idxi, ip, idsi[j]);
962  }
963  }
964  minheap_reorder (k, simi, idxi);
965  }
966 
967 }
968 
969 void knn_L2sqr_by_idx (const float * x,
970  const float * y,
971  const long * __restrict ids,
972  size_t d, size_t nx, size_t ny,
973  float_maxheap_array_t * res)
974 {
975  size_t k = res->k;
976 
977 #pragma omp parallel for
978  for (size_t i = 0; i < nx; i++) {
979  const float * x_ = x + i * d;
980  const long * __restrict idsi = ids + i * ny;
981  float * __restrict simi = res->get_val(i);
982  long * __restrict idxi = res->get_ids (i);
983  maxheap_heapify (res->k, simi, idxi);
984  for (size_t j = 0; j < ny; j++) {
985  float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
986 
987  if (disij < simi[0]) {
988  maxheap_pop (k, simi, idxi);
989  maxheap_push (k, simi, idxi, disij, idsi[j]);
990  }
991  }
992  maxheap_reorder (res->k, simi, idxi);
993  }
994 
995 }
996 
997 
998 
999 
1000 
1001 /***************************************************************************
1002  * Range search
1003  ***************************************************************************/
1004 
1005 /** Find the nearest neighbors for nx queries in a set of ny vectors
1006  * compute_l2 = compute pairwise squared L2 distance rather than inner prod
1007  */
1008 template <bool compute_l2>
1009 static void range_search_blas (
1010  const float * x,
1011  const float * y,
1012  size_t d, size_t nx, size_t ny,
1013  float radius,
1014  RangeSearchResult *result)
1015 {
1016 
1017  // BLAS does not like empty matrices
1018  if (nx == 0 || ny == 0) return;
1019 
1020  /* block sizes */
1021  const size_t bs_x = 4096, bs_y = 1024;
1022  // const size_t bs_x = 16, bs_y = 16;
1023  float *ip_block = new float[bs_x * bs_y];
1024 
1025  float *x_norms = nullptr, *y_norms = nullptr;
1026 
1027  if (compute_l2) {
1028  x_norms = new float[nx];
1029  fvec_norms_L2sqr (x_norms, x, d, nx);
1030  y_norms = new float[ny];
1031  fvec_norms_L2sqr (y_norms, y, d, ny);
1032  }
1033 
1034  std::vector <RangeSearchPartialResult *> partial_results;
1035 
1036  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
1037  size_t j1 = j0 + bs_y;
1038  if (j1 > ny) j1 = ny;
1039  RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
1040  partial_results.push_back (pres);
1041 
1042  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
1043  size_t i1 = i0 + bs_x;
1044  if(i1 > nx) i1 = nx;
1045 
1046  /* compute the actual dot products */
1047  {
1048  float one = 1, zero = 0;
1049  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1050  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
1051  y + j0 * d, &di,
1052  x + i0 * d, &di, &zero,
1053  ip_block, &nyi);
1054  }
1055 
1056 
1057  for (size_t i = i0; i < i1; i++) {
1058  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1059 
1060  RangeSearchPartialResult::QueryResult & qres =
1061  pres->new_result (i);
1062 
1063  for (size_t j = j0; j < j1; j++) {
1064  float ip = *ip_line++;
1065  if (compute_l2) {
1066  float dis = x_norms[i] + y_norms[j] - 2 * ip;
1067  if (dis < radius) {
1068  qres.add (dis, j);
1069  }
1070  } else {
1071  if (ip > radius) {
1072  qres.add (ip, j);
1073  }
1074  }
1075  }
1076  }
1077  }
1078 
1079  }
1080  delete [] ip_block;
1081  delete [] x_norms;
1082  delete [] y_norms;
1083 
1084  { // merge the partial results
1085  int npres = partial_results.size();
1086  // count
1087  for (size_t i = 0; i < nx; i++) {
1088  for (int j = 0; j < npres; j++)
1089  result->lims[i] += partial_results[j]->queries[i].nres;
1090  }
1091  result->do_allocation ();
1092  for (int j = 0; j < npres; j++) {
1093  partial_results[j]->set_result (true);
1094  delete partial_results[j];
1095  }
1096 
1097  // reset the limits
1098  for (size_t i = nx; i > 0; i--) {
1099  result->lims [i] = result->lims [i - 1];
1100  }
1101  result->lims [0] = 0;
1102  }
1103 }
1104 
1105 
1106 template <bool compute_l2>
1107 static void range_search_sse (const float * x,
1108  const float * y,
1109  size_t d, size_t nx, size_t ny,
1110  float radius,
1111  RangeSearchResult *res)
1112 {
1113  FAISS_ASSERT (d % 4 == 0);
1114 
1115 #pragma omp parallel
1116  {
1117  RangeSearchPartialResult pres (res);
1118 
1119 #pragma omp for
1120  for (size_t i = 0; i < nx; i++) {
1121  const float * x_ = x + i * d;
1122  const float * y_ = y;
1123  size_t j;
1124 
1125  RangeSearchPartialResult::QueryResult & qres =
1126  pres.new_result (i);
1127 
1128  for (j = 0; j < ny; j++) {
1129  if (compute_l2) {
1130  float disij = fvec_L2sqr (x_, y_, d);
1131  if (disij < radius) {
1132  qres.add (disij, j);
1133  }
1134  } else {
1135  float ip = fvec_inner_product (x_, y_, d);
1136  if (ip > radius) {
1137  qres.add (ip, j);
1138  }
1139  }
1140  y_ += d;
1141  }
1142 
1143  }
1144  pres.finalize ();
1145  }
1146 }
1147 
1148 
1149 
1150 
1151 
1153  const float * x,
1154  const float * y,
1155  size_t d, size_t nx, size_t ny,
1156  float radius,
1157  RangeSearchResult *res)
1158 {
1159 
1160  if (d % 4 == 0 && nx < 20) {
1161  range_search_sse<true> (x, y, d, nx, ny, radius, res);
1162  } else {
1163  range_search_blas<true> (x, y, d, nx, ny, radius, res);
1164  }
1165 }
1166 
1168  const float * x,
1169  const float * y,
1170  size_t d, size_t nx, size_t ny,
1171  float radius,
1172  RangeSearchResult *res)
1173 {
1174 
1175  if (d % 4 == 0 && nx < 20) {
1176  range_search_sse<false> (x, y, d, nx, ny, radius, res);
1177  } else {
1178  range_search_blas<false> (x, y, d, nx, ny, radius, res);
1179  }
1180 }
1181 
1182 
1183 
1184 /***************************************************************************
1185  * Some matrix manipulation functions
1186  ***************************************************************************/
1187 
1188 
1189 /* This function exists because the Torch counterpart is extremly slow
1190  (not multi-threaded + unexpected overhead even in single thread).
1191  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
1192 void inner_product_to_L2sqr (float * __restrict dis,
1193  const float * nr1,
1194  const float * nr2,
1195  size_t n1, size_t n2)
1196 {
1197 
1198 #pragma omp parallel for
1199  for (size_t j = 0 ; j < n1 ; j++) {
1200  float * disj = dis + j * n2;
1201  for (size_t i = 0 ; i < n2 ; i++)
1202  disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1203  }
1204 }
1205 
1206 
1207 void matrix_qr (int m, int n, float *a)
1208 {
1209  FAISS_ASSERT(m >= n);
1210  FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1211  std::vector<float> tau (ki);
1212  FINTEGER lwork = -1, info;
1213  float work_size;
1214 
1215  sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1216  &work_size, &lwork, &info);
1217  lwork = size_t(work_size);
1218  std::vector<float> work (lwork);
1219 
1220  sgeqrf_ (&mi, &ni, a, &mi,
1221  tau.data(), work.data(), &lwork, &info);
1222 
1223  sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1224  work.data(), &lwork, &info);
1225 
1226 }
1227 
1228 
1229 void pairwise_L2sqr (long d,
1230  long nq, const float *xq,
1231  long nb, const float *xb,
1232  float *dis,
1233  long ldq, long ldb, long ldd)
1234 {
1235  if (nq == 0 || nb == 0) return;
1236  if (ldq == -1) ldq = d;
1237  if (ldb == -1) ldb = d;
1238  if (ldd == -1) ldd = nb;
1239 
1240  // store in beginning of distance matrix to avoid malloc
1241  float *b_norms = dis;
1242 
1243 #pragma omp parallel for
1244  for (long i = 0; i < nb; i++)
1245  b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
1246 
1247 #pragma omp parallel for
1248  for (long i = 1; i < nq; i++) {
1249  float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
1250  for (long j = 0; j < nb; j++)
1251  dis[i * ldd + j] = q_norm + b_norms [j];
1252  }
1253 
1254  {
1255  float q_norm = fvec_norm_L2sqr (xq, d);
1256  for (long j = 0; j < nb; j++)
1257  dis[j] += q_norm;
1258  }
1259 
1260  {
1261  FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1262  float one = 1.0, minus_2 = -2.0;
1263 
1264  sgemm_ ("Transposed", "Not transposed",
1265  &nbi, &nqi, &di,
1266  &minus_2,
1267  xb, &ldbi,
1268  xq, &ldqi,
1269  &one, dis, &lddi);
1270  }
1271 
1272 
1273 }
1274 
1275 
1276 
1277 /***************************************************************************
1278  * Kmeans subroutine
1279  ***************************************************************************/
1280 
1281 #define EPS 1e-7
1282 
1283 /* For k-means, compute centroids given assignment of vectors to centroids */
1284 /* NOTE: This could be multi-threaded (use histogram of indexes) */
1285 int km_update_centroids (const float * x,
1286  float * centroids,
1287  long * assign,
1288  size_t d, size_t k, size_t n)
1289 {
1290  std::vector<size_t> hassign(k);
1291  memset (centroids, 0, sizeof(*centroids) * d * k);
1292 
1293 
1294 #pragma omp parallel
1295  {
1296  int nt = omp_get_num_threads();
1297  int rank = omp_get_thread_num();
1298  // this thread is taking care of centroids c0:c1
1299  size_t c0 = (k * rank) / nt;
1300  size_t c1 = (k * (rank + 1)) / nt;
1301  const float *xi = x;
1302  // printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
1303  size_t nacc = 0;
1304 
1305  for (size_t i = 0; i < n; i++) {
1306  long ci = assign[i];
1307  assert (ci >= 0 && ci < k);
1308  if (ci >= c0 && ci < c1) {
1309  float * c = centroids + ci * d;
1310  hassign[ci]++;
1311  for (size_t j = 0; j < d; j++)
1312  c[j] += xi[j];
1313  nacc++;
1314  }
1315  xi += d;
1316  }
1317  // printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
1318 
1319  }
1320 
1321 #pragma omp parallel for
1322  for (size_t ci = 0; ci < k; ci++) {
1323  float * c = centroids + ci * d;
1324  float ni = (float) hassign[ci];
1325  if (ni != 0) {
1326  for (size_t j = 0; j < d; j++)
1327  c[j] /= ni;
1328  }
1329  }
1330 
1331  /* Take care of void clusters */
1332  size_t nsplit = 0;
1333  RandomGenerator rng (1234);
1334  for (size_t ci = 0; ci < k; ci++) {
1335  if (hassign[ci] == 0) { /* need to redefine a centroid */
1336  size_t cj;
1337  for (cj = 0; 1; cj = (cj+1) % k) {
1338  /* probability to pick this cluster for split */
1339  float p = (hassign[cj] - 1.0) / (float) (n - k);
1340  float r = rng.rand_float ();
1341  if (r < p) {
1342  break; /* found our cluster to be split */
1343  }
1344  }
1345  memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
1346 
1347  /* small symmetric pertubation. Much better than */
1348  for (size_t j = 0; j < d; j++)
1349  if (j % 2 == 0) {
1350  centroids[ci * d + j] += EPS;
1351  centroids[cj * d + j] -= EPS;
1352  }
1353  else {
1354  centroids[ci * d + j] -= EPS;
1355  centroids[cj * d + j] += EPS;
1356  }
1357 
1358  /* assume even split of the cluster */
1359  hassign[ci] = hassign[cj] / 2;
1360  hassign[cj] -= hassign[ci];
1361  nsplit++;
1362  }
1363  }
1364 
1365  return nsplit;
1366 }
1367 
1368 #undef EPS
1369 
1370 
1371 
1372 /***************************************************************************
1373  * Result list routines
1374  ***************************************************************************/
1375 
1376 
1377 void ranklist_handle_ties (int k, long *idx, const float *dis)
1378 {
1379  float prev_dis = -1e38;
1380  int prev_i = -1;
1381  for (int i = 0; i < k; i++) {
1382  if (dis[i] != prev_dis) {
1383  if (i > prev_i + 1) {
1384  // sort between prev_i and i - 1
1385  std::sort (idx + prev_i, idx + i);
1386  }
1387  prev_i = i;
1388  prev_dis = dis[i];
1389  }
1390  }
1391 }
1392 
1393 size_t ranklist_intersection_size (size_t k1, const long *v1,
1394  size_t k2, const long *v2_in)
1395 {
1396  if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
1397  long *v2 = new long [k2];
1398  memcpy (v2, v2_in, sizeof (long) * k2);
1399  std::sort (v2, v2 + k2);
1400  { // de-dup v2
1401  long prev = -1;
1402  size_t wp = 0;
1403  for (size_t i = 0; i < k2; i++) {
1404  if (v2 [i] != prev) {
1405  v2[wp++] = prev = v2 [i];
1406  }
1407  }
1408  k2 = wp;
1409  }
1410  const long seen_flag = 1L << 60;
1411  size_t count = 0;
1412  for (size_t i = 0; i < k1; i++) {
1413  long q = v1 [i];
1414  size_t i0 = 0, i1 = k2;
1415  while (i0 + 1 < i1) {
1416  size_t imed = (i1 + i0) / 2;
1417  long piv = v2 [imed] & ~seen_flag;
1418  if (piv <= q) i0 = imed;
1419  else i1 = imed;
1420  }
1421  if (v2 [i0] == q) {
1422  count++;
1423  v2 [i0] |= seen_flag;
1424  }
1425  }
1426  delete [] v2;
1427 
1428  return count;
1429 }
1430 
1431 double imbalance_factor (int k, const int *hist) {
1432  double tot = 0, uf = 0;
1433 
1434  for (int i = 0 ; i < k ; i++) {
1435  tot += hist[i];
1436  uf += hist[i] * (double) hist[i];
1437  }
1438  uf = uf * k / (tot * tot);
1439 
1440  return uf;
1441 }
1442 
1443 
1444 double imbalance_factor (int n, int k, const long *assign) {
1445  std::vector<int> hist(k, 0);
1446  for (int i = 0; i < n; i++) {
1447  hist[assign[i]]++;
1448  }
1449 
1450  return imbalance_factor (k, hist.data());
1451 }
1452 
1453 
1454 
1455 int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
1456  memset (hist, 0, sizeof(hist[0]) * vmax);
1457  int nout = 0;
1458  while (n--) {
1459  if (v[n] < 0 || v[n] >= vmax) nout++;
1460  else hist[v[n]]++;
1461  }
1462  return nout;
1463 }
1464 
1465 
1466 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
1467 {
1468  FAISS_ASSERT (nbits % 8 == 0);
1469  size_t d = nbits / 8;
1470  std::vector<int> accu(d * 256);
1471  const uint8_t *c = codes;
1472  for (size_t i = 0; i < n; i++)
1473  for(int j = 0; j < d; j++)
1474  accu[j * 256 + *c++]++;
1475  memset (hist, 0, sizeof(*hist) * nbits);
1476  for (int i = 0; i < d; i++) {
1477  const int *ai = accu.data() + i * 256;
1478  int * hi = hist + i * 8;
1479  for (int j = 0; j < 256; j++)
1480  for (int k = 0; k < 8; k++)
1481  if ((j >> k) & 1)
1482  hi[k] += ai[j];
1483  }
1484 
1485 }
1486 
1487 
1488 
1489 size_t ivec_checksum (size_t n, const int *a)
1490 {
1491  size_t cs = 112909;
1492  while (n--) cs = cs * 65713 + a[n] * 1686049;
1493  return cs;
1494 }
1495 
1496 
1497 namespace {
1498  struct ArgsortComparator {
1499  const float *vals;
1500  bool operator() (const size_t a, const size_t b) const {
1501  return vals[a] < vals[b];
1502  }
1503  };
1504 
1505  struct SegmentS {
1506  size_t i0; // begin pointer in the permutation array
1507  size_t i1; // end
1508  size_t len() const {
1509  return i1 - i0;
1510  }
1511  };
1512 
1513  // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
1514  // extended to > 1 merge thread
1515 
1516  // merges 2 ranges that should be consecutive on the source into
1517  // the union of the two on the destination
1518  template<typename T>
1519  void parallel_merge (const T *src, T *dst,
1520  SegmentS &s1, SegmentS & s2, int nt,
1521  const ArgsortComparator & comp) {
1522  if (s2.len() > s1.len()) { // make sure that s1 larger than s2
1523  std::swap(s1, s2);
1524  }
1525 
1526  // compute sub-ranges for each thread
1527  SegmentS s1s[nt], s2s[nt], sws[nt];
1528  s2s[0].i0 = s2.i0;
1529  s2s[nt - 1].i1 = s2.i1;
1530 
1531  // not sure parallel actually helps here
1532 #pragma omp parallel for num_threads(nt)
1533  for (int t = 0; t < nt; t++) {
1534  s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1535  s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1536 
1537  if (t + 1 < nt) {
1538  T pivot = src[s1s[t].i1];
1539  size_t i0 = s2.i0, i1 = s2.i1;
1540  while (i0 + 1 < i1) {
1541  size_t imed = (i1 + i0) / 2;
1542  if (comp (pivot, src[imed])) {i1 = imed; }
1543  else {i0 = imed; }
1544  }
1545  s2s[t].i1 = s2s[t + 1].i0 = i1;
1546  }
1547  }
1548  s1.i0 = std::min(s1.i0, s2.i0);
1549  s1.i1 = std::max(s1.i1, s2.i1);
1550  s2 = s1;
1551  sws[0].i0 = s1.i0;
1552  for (int t = 0; t < nt; t++) {
1553  sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1554  if (t + 1 < nt) {
1555  sws[t + 1].i0 = sws[t].i1;
1556  }
1557  }
1558  assert(sws[nt - 1].i1 == s1.i1);
1559 
1560  // do the actual merging
1561 #pragma omp parallel for num_threads(nt)
1562  for (int t = 0; t < nt; t++) {
1563  SegmentS sw = sws[t];
1564  SegmentS s1t = s1s[t];
1565  SegmentS s2t = s2s[t];
1566  if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1567  for (;;) {
1568  // assert (sw.len() == s1t.len() + s2t.len());
1569  if (comp(src[s1t.i0], src[s2t.i0])) {
1570  dst[sw.i0++] = src[s1t.i0++];
1571  if (s1t.i0 == s1t.i1) break;
1572  } else {
1573  dst[sw.i0++] = src[s2t.i0++];
1574  if (s2t.i0 == s2t.i1) break;
1575  }
1576  }
1577  }
1578  if (s1t.len() > 0) {
1579  assert(s1t.len() == sw.len());
1580  memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
1581  } else if (s2t.len() > 0) {
1582  assert(s2t.len() == sw.len());
1583  memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
1584  }
1585  }
1586  }
1587 
1588 };
1589 
1590 void fvec_argsort (size_t n, const float *vals,
1591  size_t *perm)
1592 {
1593  for (size_t i = 0; i < n; i++) perm[i] = i;
1594  ArgsortComparator comp = {vals};
1595  std::sort (perm, perm + n, comp);
1596 }
1597 
1598 void fvec_argsort_parallel (size_t n, const float *vals,
1599  size_t *perm)
1600 {
1601  size_t * perm2 = new size_t[n];
1602  // 2 result tables, during merging, flip between them
1603  size_t *permB = perm2, *permA = perm;
1604 
1605  int nt = omp_get_max_threads();
1606  { // prepare correct permutation so that the result ends in perm
1607  // at final iteration
1608  int nseg = nt;
1609  while (nseg > 1) {
1610  nseg = (nseg + 1) / 2;
1611  std::swap (permA, permB);
1612  }
1613  }
1614 
1615 #pragma omp parallel
1616  for (size_t i = 0; i < n; i++) permA[i] = i;
1617 
1618  ArgsortComparator comp = {vals};
1619 
1620  SegmentS segs[nt];
1621 
1622  // independent sorts
1623 #pragma omp parallel for
1624  for (int t = 0; t < nt; t++) {
1625  size_t i0 = t * n / nt;
1626  size_t i1 = (t + 1) * n / nt;
1627  SegmentS seg = {i0, i1};
1628  std::sort (permA + seg.i0, permA + seg.i1, comp);
1629  segs[t] = seg;
1630  }
1631  int prev_nested = omp_get_nested();
1632  omp_set_nested(1);
1633 
1634  int nseg = nt;
1635  while (nseg > 1) {
1636  int nseg1 = (nseg + 1) / 2;
1637  int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1638  int sub_nseg1 = nseg / 2;
1639 
1640 #pragma omp parallel for num_threads(nseg1)
1641  for (int s = 0; s < nseg; s += 2) {
1642  if (s + 1 == nseg) { // otherwise isolated segment
1643  memcpy(permB + segs[s].i0, permA + segs[s].i0,
1644  segs[s].len() * sizeof(size_t));
1645  } else {
1646  int t0 = s * sub_nt / sub_nseg1;
1647  int t1 = (s + 1) * sub_nt / sub_nseg1;
1648  printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1649  parallel_merge(permA, permB, segs[s], segs[s + 1],
1650  t1 - t0, comp);
1651  }
1652  }
1653  for (int s = 0; s < nseg; s += 2)
1654  segs[s / 2] = segs[s];
1655  nseg = nseg1;
1656  std::swap (permA, permB);
1657  }
1658  assert (permA == perm);
1659  omp_set_nested(prev_nested);
1660  delete [] perm2;
1661 }
1662 
1663 
1664 
1665 
1666 
1667 
1668 
1669 
1670 
1671 
1672 
1673 
1674 
1675 
1676 
1677 
1678 /***************************************************************************
1679  * heavily optimized table computations
1680  ***************************************************************************/
1681 
1682 
1683 static inline void fvec_madd_ref (size_t n, const float *a,
1684  float bf, const float *b, float *c) {
1685  for (size_t i = 0; i < n; i++)
1686  c[i] = a[i] + bf * b[i];
1687 }
1688 
1689 
1690 static inline void fvec_madd_sse (size_t n, const float *a,
1691  float bf, const float *b, float *c) {
1692  n >>= 2;
1693  __m128 bf4 = _mm_set_ps1 (bf);
1694  __m128 * a4 = (__m128*)a;
1695  __m128 * b4 = (__m128*)b;
1696  __m128 * c4 = (__m128*)c;
1697 
1698  while (n--) {
1699  *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1700  b4++;
1701  a4++;
1702  c4++;
1703  }
1704 }
1705 
1706 void fvec_madd (size_t n, const float *a,
1707  float bf, const float *b, float *c)
1708 {
1709  if ((n & 3) == 0 &&
1710  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1711  fvec_madd_sse (n, a, bf, b, c);
1712  else
1713  fvec_madd_ref (n, a, bf, b, c);
1714 }
1715 
1716 static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
1717  float bf, const float *b, float *c) {
1718  float vmin = 1e20;
1719  int imin = -1;
1720 
1721  for (size_t i = 0; i < n; i++) {
1722  c[i] = a[i] + bf * b[i];
1723  if (c[i] < vmin) {
1724  vmin = c[i];
1725  imin = i;
1726  }
1727  }
1728  return imin;
1729 }
1730 
1731 static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
1732  float bf, const float *b, float *c) {
1733  n >>= 2;
1734  __m128 bf4 = _mm_set_ps1 (bf);
1735  __m128 vmin4 = _mm_set_ps1 (1e20);
1736  __m128i imin4 = _mm_set1_epi32 (-1);
1737  __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1738  __m128i inc4 = _mm_set1_epi32 (4);
1739  __m128 * a4 = (__m128*)a;
1740  __m128 * b4 = (__m128*)b;
1741  __m128 * c4 = (__m128*)c;
1742 
1743  while (n--) {
1744  __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1745  *c4 = vc4;
1746  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1747  // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
1748 
1749  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1750  _mm_andnot_si128 (mask, imin4));
1751  vmin4 = _mm_min_ps (vmin4, vc4);
1752  b4++;
1753  a4++;
1754  c4++;
1755  idx4 = _mm_add_epi32 (idx4, inc4);
1756  }
1757 
1758  // 4 values -> 2
1759  {
1760  idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1761  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1762  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1763  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1764  _mm_andnot_si128 (mask, imin4));
1765  vmin4 = _mm_min_ps (vmin4, vc4);
1766  }
1767  // 2 values -> 1
1768  {
1769  idx4 = _mm_shuffle_epi32 (imin4, 1);
1770  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1771  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1772  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1773  _mm_andnot_si128 (mask, imin4));
1774  // vmin4 = _mm_min_ps (vmin4, vc4);
1775  }
1776  return _mm_extract_epi32 (imin4, 0);
1777 }
1778 
1779 
1780 int fvec_madd_and_argmin (size_t n, const float *a,
1781  float bf, const float *b, float *c)
1782 {
1783  if ((n & 3) == 0 &&
1784  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1785  return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1786  else
1787  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
1788 }
1789 
1790 
1791 
1792 
1793 
1794 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:49
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:871
RandomGenerator(long seed=1234)
initialize
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n)
Definition: utils.cpp:1285
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:431
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1466
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1377
float rand_float()
between 0 and 1
Definition: utils.cpp:209
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1706
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1455
long rand_long()
random long &lt; 2 ^ 62
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1393
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1229
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1167
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:831
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:72
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1431
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:512
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1152
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1207
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1489
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1780
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:851