Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 // -*- c++ -*-
12 
13 #include "utils.h"
14 
15 #include <cstdio>
16 #include <cassert>
17 #include <cstring>
18 #include <cmath>
19 
20 #include <smmintrin.h>
21 #include <mmintrin.h>
22 
23 #include <sys/time.h>
24 #include <sys/types.h>
25 #include <unistd.h>
26 
27 #include <omp.h>
28 
29 
30 #include <algorithm>
31 #include <vector>
32 
33 #include "AuxIndexStructures.h"
34 #include "FaissAssert.h"
35 
36 
37 
38 #ifndef FINTEGER
39 #define FINTEGER long
40 #endif
41 
42 
43 extern "C" {
44 
45 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
46 
47 int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
48  n, FINTEGER *k, const float *alpha, const float *a,
49  FINTEGER *lda, const float *b, FINTEGER *
50  ldb, float *beta, float *c, FINTEGER *ldc);
51 
52 /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
53 
54 int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
55  float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
56 
57 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
58  FINTEGER *lda, float *tau, float *work,
59  FINTEGER *lwork, FINTEGER *info);
60 
61 
62 }
63 
64 
65 /**************************************************
66  * Get some stats about the system
67  **************************************************/
68 
69 namespace faiss {
70 
71 double getmillisecs () {
72  struct timeval tv;
73  gettimeofday (&tv, nullptr);
74  return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
75 }
76 
77 
78 #ifdef __linux__
79 
80 size_t get_mem_usage_kb ()
81 {
82  int pid = getpid ();
83  char fname[256];
84  snprintf (fname, 256, "/proc/%d/status", pid);
85  FILE * f = fopen (fname, "r");
86  FAISS_ASSERT (f || !"cannot open proc status file") ;
87  size_t sz = 0;
88  for (;;) {
89  char buf [256];
90  if (!fgets (buf, 256, f)) break;
91  if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
92  }
93  fclose (f);
94  return sz;
95 }
96 
97 #elif __APPLE__
98 
99 size_t get_mem_usage_kb ()
100 {
101  fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
102  return 0;
103 }
104 
105 #endif
106 
107 
108 
109 /**************************************************
110  * Random data generation functions
111  **************************************************/
112 
113 /**
114  * The definition of random functions depends on the architecture:
115  *
116  * - for Linux, we rely on re-entrant functions (random_r). This
117  * provides good quality reproducible random sequences.
118  *
119  * - for Apple, we use rand_r. Apple is trying so hard to deprecate
120  * this function that it removed its definition form stdlib.h, so we
121  * re-declare it below. Fortunately, since it is deprecated, its
122  * prototype should not change much in the forerseeable future.
123  *
124  * Unfortunately, system designers are more concerned with making the
125  * most unpredictable random sequences for cryptographic use, when in
126  * scientific contexts what acutally matters is having reproducible
127  * squences in multi-threaded contexts.
128  */
129 
130 
131 #ifdef __linux__
132 
133 
134 
135 
137 {
138  int32_t a;
139  random_r (&rand_data, &a);
140  return a;
141 }
142 
144 {
145  int32_t a, b;
146  random_r (&rand_data, &a);
147  random_r (&rand_data, &b);
148  return long(a) | long(b) << 31;
149 }
150 
151 
153 {
154  memset (&rand_data, 0, sizeof (rand_data));
155  initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
156 }
157 
158 
159 RandomGenerator::RandomGenerator (const RandomGenerator & other)
160 {
161  memcpy (rand_state, other.rand_state, sizeof(rand_state));
162  rand_data = other.rand_data;
163  setstate_r (rand_state, &rand_data);
164 }
165 
166 
167 #elif __APPLE__
168 
169 extern "C" {
170 int rand_r(unsigned *seed);
171 }
172 
174 {
175  rand_state = seed;
176 }
177 
178 
179 RandomGenerator::RandomGenerator (const RandomGenerator & other)
180 {
181  rand_state = other.rand_state;
182 }
183 
184 
186 {
187  // RAND_MAX is 31 bits
188  // try to add more randomness in the lower bits
189  int lowbits = rand_r(&rand_state) >> 15;
190  return rand_r(&rand_state) ^ lowbits;
191 }
192 
194 {
195  return long(random()) | long(random()) << 31;
196 }
197 
198 
199 
200 #endif
201 
203 { // this suffers form non-uniform probabilities when max is not a
204  // power of 2, but if RAND_MAX >> max the bias is limited.
205  return rand_int () % max;
206 }
207 
209 {
210  return rand_int() / float(1L << 31);
211 }
212 
213 double RandomGenerator::rand_double ()
214 {
215  return rand_long() / double(1L << 62);
216 }
217 
218 
219 /***********************************************************************
220  * Random functions in this C file only exist because Torch
221  * counterparts are slow and not multi-threaded. Typical use is for
222  * more than 1-100 billion values. */
223 
224 
225 /* Generate a set of random floating point values such that x[i] in [0,1]
226  multi-threading. For this reason, we rely on re-entreant functions. */
227 void float_rand (float * x, size_t n, long seed)
228 {
229  // only try to parallelize on large enough arrays
230  const size_t nblock = n < 1024 ? 1 : 1024;
231 
232  RandomGenerator rng0 (seed);
233  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
234 
235 #pragma omp parallel for
236  for (size_t j = 0; j < nblock; j++) {
237 
238  RandomGenerator rng (a0 + j * b0);
239 
240  const size_t istart = j * n / nblock;
241  const size_t iend = (j + 1) * n / nblock;
242 
243  for (size_t i = istart; i < iend; i++)
244  x[i] = rng.rand_float ();
245  }
246 }
247 
248 
249 void float_randn (float * x, size_t n, long seed)
250 {
251  // only try to parallelize on large enough arrays
252  const size_t nblock = n < 1024 ? 1 : 1024;
253 
254  RandomGenerator rng0 (seed);
255  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
256 
257 #pragma omp parallel for
258  for (size_t j = 0; j < nblock; j++) {
259  RandomGenerator rng (a0 + j * b0);
260 
261  double a = 0, b = 0, s = 0;
262  int state = 0; /* generate two number per "do-while" loop */
263 
264  const size_t istart = j * n / nblock;
265  const size_t iend = (j + 1) * n / nblock;
266 
267  for (size_t i = istart; i < iend; i++) {
268  /* Marsaglia's method (see Knuth) */
269  if (state == 0) {
270  do {
271  a = 2.0 * rng.rand_double () - 1;
272  b = 2.0 * rng.rand_double () - 1;
273  s = a * a + b * b;
274  } while (s >= 1.0);
275  x[i] = a * sqrt(-2.0 * log(s) / s);
276  }
277  else
278  x[i] = b * sqrt(-2.0 * log(s) / s);
279  state = 1 - state;
280  }
281  }
282 }
283 
284 
285 /* Integer versions */
286 void long_rand (long * x, size_t n, long seed)
287 {
288  // only try to parallelize on large enough arrays
289  const size_t nblock = n < 1024 ? 1 : 1024;
290 
291  RandomGenerator rng0 (seed);
292  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
293 
294 #pragma omp parallel for
295  for (size_t j = 0; j < nblock; j++) {
296 
297  RandomGenerator rng (a0 + j * b0);
298 
299  const size_t istart = j * n / nblock;
300  const size_t iend = (j + 1) * n / nblock;
301  for (size_t i = istart; i < iend; i++)
302  x[i] = rng.rand_long ();
303  }
304 }
305 
306 
307 
308 void rand_perm (int *perm, size_t n, long seed)
309 {
310  for (size_t i = 0; i < n; i++) perm[i] = i;
311 
312  RandomGenerator rng (seed);
313 
314  for (size_t i = 0; i + 1 < n; i++) {
315  int i2 = i + rng.rand_int (n - i);
316  std::swap(perm[i], perm[i2]);
317  }
318 }
319 
320 
321 
322 
323 void byte_rand (uint8_t * x, size_t n, long seed)
324 {
325  // only try to parallelize on large enough arrays
326  const size_t nblock = n < 1024 ? 1 : 1024;
327 
328  RandomGenerator rng0 (seed);
329  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
330 
331 #pragma omp parallel for
332  for (size_t j = 0; j < nblock; j++) {
333 
334  RandomGenerator rng (a0 + j * b0);
335 
336  const size_t istart = j * n / nblock;
337  const size_t iend = (j + 1) * n / nblock;
338 
339  size_t i;
340  for (i = istart; i < iend; i++)
341  x[i] = rng.rand_long ();
342  }
343 }
344 
345 
346 
347 void reflection (const float * __restrict u,
348  float * __restrict x,
349  size_t n, size_t d, size_t nu)
350 {
351  size_t i, j, l;
352  for (i = 0; i < n; i++) {
353  const float * up = u;
354  for (l = 0; l < nu; l++) {
355  float ip1 = 0, ip2 = 0;
356 
357  for (j = 0; j < d; j+=2) {
358  ip1 += up[j] * x[j];
359  ip2 += up[j+1] * x[j+1];
360  }
361  float ip = 2 * (ip1 + ip2);
362 
363  for (j = 0; j < d; j++)
364  x[j] -= ip * up[j];
365  up += d;
366  }
367  x += d;
368  }
369 }
370 
371 
372 /* Reference implementation (slower) */
373 void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
374 {
375  size_t i, j, l;
376  for (i = 0; i < n; i++) {
377  const float * up = u;
378  for (l = 0; l < nu; l++) {
379  double ip = 0;
380 
381  for (j = 0; j < d; j++)
382  ip += up[j] * x[j];
383  ip *= 2;
384 
385  for (j = 0; j < d; j++)
386  x[j] -= ip * up[j];
387 
388  up += d;
389  }
390  x += d;
391  }
392 }
393 
394 /*********************************************************
395  * Optimized distance computations
396  *********************************************************/
397 
398 
399 
400 /* Functions to compute:
401  - L2 distance between 2 vectors
402  - inner product between 2 vectors
403  - L2 norm of a vector
404 
405  The functions should probably not be invoked when a large number of
406  vectors are be processed in batch (in which case Matrix multiply
407  is faster), but may be useful for comparing vectors isolated in
408  memory.
409 
410  Works with any vectors of any dimension, even unaligned (in which
411  case they are slower).
412 
413 */
414 
415 
416 // set some of the floats in x to 0 so that there are d floats remaining
417 // 0 <= d <= 4
418 static inline __m128 maskout (int d, __m128 x)
419 {
420  // check which values to clear out
421  __m128i d4 = _mm_set1_epi32 (d);
422  __m128i lim = _mm_set_epi32 (3, 2, 1, 0);
423  __m128i mask = _mm_cmpgt_epi32 (d4, lim);
424 
425  return _mm_and_ps (x, (__m128)mask);
426 }
427 
428 
429 /* SSE-implementation of L2 distance */
430 float fvec_L2sqr (const float * x,
431  const float * y,
432  size_t d)
433 {
434  __m128 mx, my;
435  __m128 msum1 = _mm_setzero_ps();
436 
437  while (d > 4) {
438  mx = _mm_loadu_ps (x); x += 4;
439  my = _mm_loadu_ps (y); y += 4;
440  const __m128 a_m_b1 = _mm_sub_ps (mx, my);
441  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
442  d -= 4;
443  }
444 
445  // add the last 1, 2, 3, or 4 values
446  mx = _mm_loadu_ps (x);
447  my = _mm_loadu_ps (y);
448  __m128 a_m_b1 = maskout (d, _mm_sub_ps (mx, my));
449 
450  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
451 
452  msum1 = _mm_hadd_ps (msum1, msum1);
453  msum1 = _mm_hadd_ps (msum1, msum1);
454  return _mm_cvtss_f32 (msum1);
455 }
456 
457 
458 /* same without SSE */
459 float fvec_L2sqr_ref (const float * x,
460  const float * y,
461  size_t d)
462 {
463  size_t i;
464  float res_ = 0;
465  for (i = 0; i < d; i++) {
466  const float tmp = x[i] - y[i];
467  res_ += tmp * tmp;
468  }
469  return res_;
470 }
471 
472 float fvec_inner_product (const float * x,
473  const float * y,
474  size_t d)
475 {
476  __m128 mx, my;
477  __m128 msum1 = _mm_setzero_ps();
478 
479  while (d > 4) {
480  mx = _mm_loadu_ps (x); x += 4;
481  my = _mm_loadu_ps (y); y += 4;
482  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
483  d -= 4;
484  }
485 
486  // add the last 1, 2, 3, or 4 values
487  mx = _mm_loadu_ps (x);
488  my = _mm_loadu_ps (y);
489  __m128 prod = maskout (d, _mm_mul_ps (mx, my));
490 
491  msum1 = _mm_add_ps (msum1, prod);
492 
493  msum1 = _mm_hadd_ps (msum1, msum1);
494  msum1 = _mm_hadd_ps (msum1, msum1);
495  return _mm_cvtss_f32 (msum1);
496 }
497 
498 
499 float fvec_inner_product_ref (const float * x,
500  const float * y,
501  size_t d)
502 {
503  size_t i;
504  float res_ = 0;
505  for (i = 0; i < d; i++)
506  res_ += x[i] * y[i];
507  return res_;
508 }
509 
510 
511 float fvec_norm_L2sqr (const float * x,
512  size_t d)
513 {
514  __m128 mx;
515  __m128 msum1 = _mm_setzero_ps();
516 
517  while (d > 4) {
518  mx = _mm_loadu_ps (x); x += 4;
519  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
520  d -= 4;
521  }
522 
523  mx = maskout (d, _mm_loadu_ps (x));
524  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
525 
526  msum1 = _mm_hadd_ps (msum1, msum1);
527  msum1 = _mm_hadd_ps (msum1, msum1);
528  return _mm_cvtss_f32 (msum1);
529 }
530 
531 
532 float fvec_norm_L2sqr_ref (const float * __restrict x,
533  size_t d)
534 {
535  size_t i;
536  double res_ = 0;
537  for (i = 0; i < d; i++)
538  res_ += x[i] * x[i];
539  return res_;
540 }
541 
542 
543 
544 
545 /* Compute the inner product between a vector x and
546  a set of ny vectors y.
547  These functions are not intended to replace BLAS matrix-matrix, as they
548  would be significantly less efficient in this case. */
549 void fvec_inner_products_ny (float * __restrict ip,
550  const float * x,
551  const float * y,
552  size_t d, size_t ny)
553 {
554  for (size_t i = 0; i < ny; i++) {
555  ip[i] = fvec_inner_product (x, y, d);
556  y += d;
557  }
558 }
559 
560 
561 
562 
563 /* compute ny L2 distances between x and a set of vectors y */
564 void fvec_L2sqr_ny (float * __restrict dis,
565  const float * x,
566  const float * y,
567  size_t d, size_t ny)
568 {
569  for (size_t i = 0; i < ny; i++) {
570  dis[i] = fvec_L2sqr (x, y, d);
571  y += d;
572  }
573 }
574 
575 
576 
577 
578 /* Compute the L2 norm of a set of nx vectors */
579 void fvec_norms_L2 (float * __restrict nr,
580  const float * __restrict x,
581  size_t d, size_t nx)
582 {
583 
584 #pragma omp parallel for
585  for (size_t i = 0; i < nx; i++) {
586  nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
587  }
588 }
589 
590 void fvec_norms_L2sqr (float * __restrict nr,
591  const float * __restrict x,
592  size_t d, size_t nx)
593 {
594 #pragma omp parallel for
595  for (size_t i = 0; i < nx; i++)
596  nr[i] = fvec_norm_L2sqr (x + i * d, d);
597 }
598 
599 
600 
601 void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
602 {
603 #pragma omp parallel for
604  for (size_t i = 0; i < nx; i++) {
605  float * __restrict xi = x + i * d;
606 
607  float nr = fvec_norm_L2sqr (xi, d);
608 
609  if (nr > 0) {
610  size_t j;
611  const float inv_nr = 1.0 / sqrtf (nr);
612  for (j = 0; j < d; j++)
613  xi[j] *= inv_nr;
614  }
615  }
616 }
617 
618 
619 
620 
621 
622 
623 
624 
625 
626 
627 
628 
629 
630 
631 
632 
633 
634 /***************************************************************************
635  * KNN functions
636  ***************************************************************************/
637 
638 
639 
640 /* Find the nearest neighbors for nx queries in a set of ny vectors */
641 static void knn_inner_product_sse (const float * x,
642  const float * y,
643  size_t d, size_t nx, size_t ny,
644  float_minheap_array_t * res)
645 {
646  size_t k = res->k;
647 
648 #pragma omp parallel for
649  for (size_t i = 0; i < nx; i++) {
650  const float * x_ = x + i * d;
651  const float * y_ = y;
652 
653  float * __restrict simi = res->get_val(i);
654  long * __restrict idxi = res->get_ids (i);
655 
656  minheap_heapify (k, simi, idxi);
657 
658  for (size_t j = 0; j < ny; j++) {
659  float ip = fvec_inner_product (x_, y_, d);
660 
661  if (ip > simi[0]) {
662  minheap_pop (k, simi, idxi);
663  minheap_push (k, simi, idxi, ip, j);
664  }
665  y_ += d;
666  }
667  minheap_reorder (k, simi, idxi);
668  }
669 
670 }
671 
672 static void knn_L2sqr_sse (
673  const float * x,
674  const float * y,
675  size_t d, size_t nx, size_t ny,
676  float_maxheap_array_t * res)
677 {
678  size_t k = res->k;
679 
680 #pragma omp parallel for
681  for (size_t i = 0; i < nx; i++) {
682  const float * x_ = x + i * d;
683  const float * y_ = y;
684  size_t j;
685  float * __restrict simi = res->get_val(i);
686  long * __restrict idxi = res->get_ids (i);
687 
688  maxheap_heapify (k, simi, idxi);
689  for (j = 0; j < ny; j++) {
690  float disij = fvec_L2sqr (x_, y_, d);
691 
692  if (disij < simi[0]) {
693  maxheap_pop (k, simi, idxi);
694  maxheap_push (k, simi, idxi, disij, j);
695  }
696  y_ += d;
697  }
698  maxheap_reorder (k, simi, idxi);
699  }
700 
701 }
702 
703 
704 /** Find the nearest neighbors for nx queries in a set of ny vectors */
705 static void knn_inner_product_blas (
706  const float * x,
707  const float * y,
708  size_t d, size_t nx, size_t ny,
709  float_minheap_array_t * res)
710 {
711  res->heapify ();
712 
713  // BLAS does not like empty matrices
714  if (nx == 0 || ny == 0) return;
715 
716  /* block sizes */
717  const size_t bs_x = 4096, bs_y = 1024;
718  // const size_t bs_x = 16, bs_y = 16;
719  float *ip_block = new float[bs_x * bs_y];
720 
721  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
722  size_t i1 = i0 + bs_x;
723  if(i1 > nx) i1 = nx;
724 
725  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
726  size_t j1 = j0 + bs_y;
727  if (j1 > ny) j1 = ny;
728  /* compute the actual dot products */
729  {
730  float one = 1, zero = 0;
731  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
732  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
733  y + j0 * d, &di,
734  x + i0 * d, &di, &zero,
735  ip_block, &nyi);
736  }
737 
738  /* collect maxima */
739  res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
740  }
741  }
742  delete [] ip_block;
743  res->reorder ();
744 }
745 
746 // distance correction is an operator that can be applied to transform
747 // the distances
748 template<class DistanceCorrection>
749 static void knn_L2sqr_blas (const float * x,
750  const float * y,
751  size_t d, size_t nx, size_t ny,
752  float_maxheap_array_t * res,
753  const DistanceCorrection &corr)
754 {
755  res->heapify ();
756 
757  // BLAS does not like empty matrices
758  if (nx == 0 || ny == 0) return;
759 
760  size_t k = res->k;
761 
762  /* block sizes */
763  const size_t bs_x = 4096, bs_y = 1024;
764  // const size_t bs_x = 16, bs_y = 16;
765  float *ip_block = new float[bs_x * bs_y];
766 
767  float *x_norms = new float[nx];
768  fvec_norms_L2sqr (x_norms, x, d, nx);
769 
770  float *y_norms = new float[ny];
771  fvec_norms_L2sqr (y_norms, y, d, ny);
772 
773  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
774  size_t i1 = i0 + bs_x;
775  if(i1 > nx) i1 = nx;
776 
777  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
778  size_t j1 = j0 + bs_y;
779  if (j1 > ny) j1 = ny;
780  /* compute the actual dot products */
781  {
782  float one = 1, zero = 0;
783  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
784  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
785  y + j0 * d, &di,
786  x + i0 * d, &di, &zero,
787  ip_block, &nyi);
788  }
789 
790  /* collect minima */
791 #pragma omp parallel for
792  for (size_t i = i0; i < i1; i++) {
793  float * __restrict simi = res->get_val(i);
794  long * __restrict idxi = res->get_ids (i);
795  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
796 
797  for (size_t j = j0; j < j1; j++) {
798  float ip = *ip_line++;
799  float dis = x_norms[i] + y_norms[j] - 2 * ip;
800 
801  dis = corr (dis, i, j);
802 
803  if (dis < simi[0]) {
804  maxheap_pop (k, simi, idxi);
805  maxheap_push (k, simi, idxi, dis, j);
806  }
807  }
808  }
809  }
810  }
811  res->reorder ();
812 
813  delete [] ip_block;
814  delete [] x_norms;
815  delete [] y_norms;
816 }
817 
818 
819 
820 
821 
822 
823 
824 
825 
826 /*******************************************************
827  * KNN driver functions
828  *******************************************************/
829 
830 void knn_inner_product (const float * x,
831  const float * y,
832  size_t d, size_t nx, size_t ny,
833  float_minheap_array_t * res)
834 {
835  if (d % 4 == 0 && nx < 20) {
836  knn_inner_product_sse (x, y, d, nx, ny, res);
837  } else {
838  knn_inner_product_blas (x, y, d, nx, ny, res);
839  }
840 }
841 
842 
843 
845  float operator()(float dis, size_t qno, size_t bno) const {
846  return dis;
847  }
848 };
849 
850 void knn_L2sqr (const float * x,
851  const float * y,
852  size_t d, size_t nx, size_t ny,
853  float_maxheap_array_t * res)
854 {
855  if (d % 4 == 0 && nx < 20) {
856  knn_L2sqr_sse (x, y, d, nx, ny, res);
857  } else {
859  knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
860  }
861 }
862 
864  const float *base_shift;
865  float operator()(float dis, size_t qno, size_t bno) const {
866  return dis - base_shift[bno];
867  }
868 };
869 
871  const float * x,
872  const float * y,
873  size_t d, size_t nx, size_t ny,
874  float_maxheap_array_t * res,
875  const float *base_shift)
876 {
877  BaseShiftDistanceCorrection corr = {base_shift};
878  knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
879 }
880 
881 
882 
883 /***************************************************************************
884  * compute a subset of distances
885  ***************************************************************************/
886 
887 /* compute the inner product between x and a subset y of ny vectors,
888  whose indices are given by idy. */
889 void fvec_inner_products_by_idx (float * __restrict ip,
890  const float * x,
891  const float * y,
892  const long * __restrict ids, /* for y vecs */
893  size_t d, size_t nx, size_t ny)
894 {
895 #pragma omp parallel for
896  for (size_t j = 0; j < nx; j++) {
897  const long * __restrict idsj = ids + j * ny;
898  const float * xj = x + j * d;
899  float * __restrict ipj = ip + j * ny;
900  for (size_t i = 0; i < ny; i++) {
901  if (idsj[i] < 0)
902  continue;
903  ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
904  }
905  }
906 }
907 
908 /* compute the inner product between x and a subset y of ny vectors,
909  whose indices are given by idy. */
910 void fvec_L2sqr_by_idx (float * __restrict dis,
911  const float * x,
912  const float * y,
913  const long * __restrict ids, /* ids of y vecs */
914  size_t d, size_t nx, size_t ny)
915 {
916 #pragma omp parallel for
917  for (size_t j = 0; j < nx; j++) {
918  const long * __restrict idsj = ids + j * ny;
919  const float * xj = x + j * d;
920  float * __restrict disj = dis + j * ny;
921  for (size_t i = 0; i < ny; i++) {
922  if (idsj[i] < 0)
923  continue;
924  disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
925  }
926  }
927 }
928 
929 
930 
931 
932 
933 /* Find the nearest neighbors for nx queries in a set of ny vectors
934  indexed by ids. May be useful for re-ranking a pre-selected vector list */
935 void knn_inner_products_by_idx (const float * x,
936  const float * y,
937  const long * __restrict ids,
938  size_t d, size_t nx, size_t ny,
939  float_minheap_array_t * res)
940 {
941  size_t k = res->k;
942 
943 
944 
945 #pragma omp parallel for
946  for (size_t i = 0; i < nx; i++) {
947  const float * x_ = x + i * d;
948  const long * idsi = ids + i * ny;
949  size_t j;
950  float * __restrict simi = res->get_val(i);
951  long * __restrict idxi = res->get_ids (i);
952  minheap_heapify (k, simi, idxi);
953 
954  for (j = 0; j < ny; j++) {
955  if (idsi[j] < 0) break;
956  float ip = fvec_inner_product (x_, y + d * idsi[j], d);
957 
958  if (ip > simi[0]) {
959  minheap_pop (k, simi, idxi);
960  minheap_push (k, simi, idxi, ip, idsi[j]);
961  }
962  }
963  minheap_reorder (k, simi, idxi);
964  }
965 
966 }
967 
968 void knn_L2sqr_by_idx (const float * x,
969  const float * y,
970  const long * __restrict ids,
971  size_t d, size_t nx, size_t ny,
972  float_maxheap_array_t * res)
973 {
974  size_t k = res->k;
975 
976 #pragma omp parallel for
977  for (size_t i = 0; i < nx; i++) {
978  const float * x_ = x + i * d;
979  const long * __restrict idsi = ids + i * ny;
980  float * __restrict simi = res->get_val(i);
981  long * __restrict idxi = res->get_ids (i);
982  maxheap_heapify (res->k, simi, idxi);
983  for (size_t j = 0; j < ny; j++) {
984  float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
985 
986  if (disij < simi[0]) {
987  maxheap_pop (k, simi, idxi);
988  maxheap_push (k, simi, idxi, disij, idsi[j]);
989  }
990  }
991  maxheap_reorder (res->k, simi, idxi);
992  }
993 
994 }
995 
996 
997 
998 
999 
1000 /***************************************************************************
1001  * Range search
1002  ***************************************************************************/
1003 
1004 /** Find the nearest neighbors for nx queries in a set of ny vectors
1005  * compute_l2 = compute pairwise squared L2 distance rather than inner prod
1006  */
1007 template <bool compute_l2>
1008 static void range_search_blas (
1009  const float * x,
1010  const float * y,
1011  size_t d, size_t nx, size_t ny,
1012  float radius,
1013  RangeSearchResult *result)
1014 {
1015 
1016  // BLAS does not like empty matrices
1017  if (nx == 0 || ny == 0) return;
1018 
1019  /* block sizes */
1020  const size_t bs_x = 4096, bs_y = 1024;
1021  // const size_t bs_x = 16, bs_y = 16;
1022  float *ip_block = new float[bs_x * bs_y];
1023 
1024  float *x_norms = nullptr, *y_norms = nullptr;
1025 
1026  if (compute_l2) {
1027  x_norms = new float[nx];
1028  fvec_norms_L2sqr (x_norms, x, d, nx);
1029  y_norms = new float[ny];
1030  fvec_norms_L2sqr (y_norms, y, d, ny);
1031  }
1032 
1033  std::vector <RangeSearchPartialResult *> partial_results;
1034 
1035  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
1036  size_t j1 = j0 + bs_y;
1037  if (j1 > ny) j1 = ny;
1038  RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
1039  partial_results.push_back (pres);
1040 
1041  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
1042  size_t i1 = i0 + bs_x;
1043  if(i1 > nx) i1 = nx;
1044 
1045  /* compute the actual dot products */
1046  {
1047  float one = 1, zero = 0;
1048  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1049  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
1050  y + j0 * d, &di,
1051  x + i0 * d, &di, &zero,
1052  ip_block, &nyi);
1053  }
1054 
1055 
1056  for (size_t i = i0; i < i1; i++) {
1057  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1058 
1059  RangeSearchPartialResult::QueryResult & qres =
1060  pres->new_result (i);
1061 
1062  for (size_t j = j0; j < j1; j++) {
1063  float ip = *ip_line++;
1064  if (compute_l2) {
1065  float dis = x_norms[i] + y_norms[j] - 2 * ip;
1066  if (dis < radius) {
1067  qres.add (dis, j);
1068  }
1069  } else {
1070  if (ip > radius) {
1071  qres.add (ip, j);
1072  }
1073  }
1074  }
1075  }
1076  }
1077 
1078  }
1079  delete [] ip_block;
1080  delete [] x_norms;
1081  delete [] y_norms;
1082 
1083  { // merge the partial results
1084  int npres = partial_results.size();
1085  // count
1086  for (size_t i = 0; i < nx; i++) {
1087  for (int j = 0; j < npres; j++)
1088  result->lims[i] += partial_results[j]->queries[i].nres;
1089  }
1090  result->do_allocation ();
1091  for (int j = 0; j < npres; j++) {
1092  partial_results[j]->set_result (true);
1093  delete partial_results[j];
1094  }
1095 
1096  // reset the limits
1097  for (size_t i = nx; i > 0; i--) {
1098  result->lims [i] = result->lims [i - 1];
1099  }
1100  result->lims [0] = 0;
1101  }
1102 }
1103 
1104 
1105 template <bool compute_l2>
1106 static void range_search_sse (const float * x,
1107  const float * y,
1108  size_t d, size_t nx, size_t ny,
1109  float radius,
1110  RangeSearchResult *res)
1111 {
1112  FAISS_ASSERT (d % 4 == 0);
1113 
1114 #pragma omp parallel
1115  {
1116  RangeSearchPartialResult pres (res);
1117 
1118 #pragma omp for
1119  for (size_t i = 0; i < nx; i++) {
1120  const float * x_ = x + i * d;
1121  const float * y_ = y;
1122  size_t j;
1123 
1124  RangeSearchPartialResult::QueryResult & qres =
1125  pres.new_result (i);
1126 
1127  for (j = 0; j < ny; j++) {
1128  if (compute_l2) {
1129  float disij = fvec_L2sqr (x_, y_, d);
1130  if (disij < radius) {
1131  qres.add (disij, j);
1132  }
1133  } else {
1134  float ip = fvec_inner_product (x_, y_, d);
1135  if (ip > radius) {
1136  qres.add (ip, j);
1137  }
1138  }
1139  y_ += d;
1140  }
1141 
1142  }
1143  pres.finalize ();
1144  }
1145 }
1146 
1147 
1148 
1149 
1150 
1152  const float * x,
1153  const float * y,
1154  size_t d, size_t nx, size_t ny,
1155  float radius,
1156  RangeSearchResult *res)
1157 {
1158 
1159  if (d % 4 == 0 && nx < 20) {
1160  range_search_sse<true> (x, y, d, nx, ny, radius, res);
1161  } else {
1162  range_search_blas<true> (x, y, d, nx, ny, radius, res);
1163  }
1164 }
1165 
1167  const float * x,
1168  const float * y,
1169  size_t d, size_t nx, size_t ny,
1170  float radius,
1171  RangeSearchResult *res)
1172 {
1173 
1174  if (d % 4 == 0 && nx < 20) {
1175  range_search_sse<false> (x, y, d, nx, ny, radius, res);
1176  } else {
1177  range_search_blas<false> (x, y, d, nx, ny, radius, res);
1178  }
1179 }
1180 
1181 
1182 
1183 /***************************************************************************
1184  * Some matrix manipulation functions
1185  ***************************************************************************/
1186 
1187 
1188 /* This function exists because the Torch counterpart is extremly slow
1189  (not multi-threaded + unexpected overhead even in single thread).
1190  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
1191 void inner_product_to_L2sqr (float * __restrict dis,
1192  const float * nr1,
1193  const float * nr2,
1194  size_t n1, size_t n2)
1195 {
1196 
1197 #pragma omp parallel for
1198  for (size_t j = 0 ; j < n1 ; j++) {
1199  float * disj = dis + j * n2;
1200  for (size_t i = 0 ; i < n2 ; i++)
1201  disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1202  }
1203 }
1204 
1205 
1206 void matrix_qr (int m, int n, float *a)
1207 {
1208  FAISS_ASSERT(m >= n);
1209  FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1210  std::vector<float> tau (ki);
1211  FINTEGER lwork = -1, info;
1212  float work_size;
1213 
1214  sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1215  &work_size, &lwork, &info);
1216  lwork = size_t(work_size);
1217  std::vector<float> work (lwork);
1218 
1219  sgeqrf_ (&mi, &ni, a, &mi,
1220  tau.data(), work.data(), &lwork, &info);
1221 
1222  sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1223  work.data(), &lwork, &info);
1224 
1225 }
1226 
1227 
1228 void pairwise_L2sqr (long d,
1229  long nq, const float *xq,
1230  long nb, const float *xb,
1231  float *dis,
1232  long ldq, long ldb, long ldd)
1233 {
1234  if (nq == 0 || nb == 0) return;
1235  if (ldq == -1) ldq = d;
1236  if (ldb == -1) ldb = d;
1237  if (ldd == -1) ldd = nb;
1238 
1239  // store in beginning of distance matrix to avoid malloc
1240  float *b_norms = dis;
1241 
1242 #pragma omp parallel for
1243  for (long i = 0; i < nb; i++)
1244  b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
1245 
1246 #pragma omp parallel for
1247  for (long i = 1; i < nq; i++) {
1248  float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
1249  for (long j = 0; j < nb; j++)
1250  dis[i * ldd + j] = q_norm + b_norms [j];
1251  }
1252 
1253  {
1254  float q_norm = fvec_norm_L2sqr (xq, d);
1255  for (long j = 0; j < nb; j++)
1256  dis[j] += q_norm;
1257  }
1258 
1259  {
1260  FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1261  float one = 1.0, minus_2 = -2.0;
1262 
1263  sgemm_ ("Transposed", "Not transposed",
1264  &nbi, &nqi, &di,
1265  &minus_2,
1266  xb, &ldbi,
1267  xq, &ldqi,
1268  &one, dis, &lddi);
1269  }
1270 
1271 
1272 }
1273 
1274 
1275 
1276 /***************************************************************************
1277  * Kmeans subroutine
1278  ***************************************************************************/
1279 
1280 // a bit above machine epsilon for float16
1281 
1282 #define EPS (1 / 1024.)
1283 
1284 /* For k-means, compute centroids given assignment of vectors to centroids */
1285 /* NOTE: This could be multi-threaded (use histogram of indexes) */
1286 int km_update_centroids (const float * x,
1287  float * centroids,
1288  long * assign,
1289  size_t d, size_t k, size_t n)
1290 {
1291  std::vector<size_t> hassign(k);
1292  memset (centroids, 0, sizeof(*centroids) * d * k);
1293 
1294 
1295 #pragma omp parallel
1296  {
1297  int nt = omp_get_num_threads();
1298  int rank = omp_get_thread_num();
1299  // this thread is taking care of centroids c0:c1
1300  size_t c0 = (k * rank) / nt;
1301  size_t c1 = (k * (rank + 1)) / nt;
1302  const float *xi = x;
1303  // printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
1304  size_t nacc = 0;
1305 
1306  for (size_t i = 0; i < n; i++) {
1307  long ci = assign[i];
1308  assert (ci >= 0 && ci < k);
1309  if (ci >= c0 && ci < c1) {
1310  float * c = centroids + ci * d;
1311  hassign[ci]++;
1312  for (size_t j = 0; j < d; j++)
1313  c[j] += xi[j];
1314  nacc++;
1315  }
1316  xi += d;
1317  }
1318  // printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
1319 
1320  }
1321 
1322 #pragma omp parallel for
1323  for (size_t ci = 0; ci < k; ci++) {
1324  float * c = centroids + ci * d;
1325  float ni = (float) hassign[ci];
1326  if (ni != 0) {
1327  for (size_t j = 0; j < d; j++)
1328  c[j] /= ni;
1329  }
1330  }
1331 
1332  /* Take care of void clusters */
1333  size_t nsplit = 0;
1334  RandomGenerator rng (1234);
1335  for (size_t ci = 0; ci < k; ci++) {
1336  if (hassign[ci] == 0) { /* need to redefine a centroid */
1337  size_t cj;
1338  for (cj = 0; 1; cj = (cj+1) % k) {
1339  /* probability to pick this cluster for split */
1340  float p = (hassign[cj] - 1.0) / (float) (n - k);
1341  float r = rng.rand_float ();
1342  if (r < p) {
1343  break; /* found our cluster to be split */
1344  }
1345  }
1346  memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
1347 
1348  /* small symmetric pertubation. Much better than */
1349  for (size_t j = 0; j < d; j++)
1350  if (j % 2 == 0) {
1351  centroids[ci * d + j] *= 1 + EPS;
1352  centroids[cj * d + j] *= 1 - EPS;
1353  }
1354  else {
1355  centroids[ci * d + j] *= 1 + EPS;
1356  centroids[cj * d + j] *= 1 - EPS;
1357  }
1358 
1359  /* assume even split of the cluster */
1360  hassign[ci] = hassign[cj] / 2;
1361  hassign[cj] -= hassign[ci];
1362  nsplit++;
1363  }
1364  }
1365 
1366  return nsplit;
1367 }
1368 
1369 #undef EPS
1370 
1371 
1372 
1373 /***************************************************************************
1374  * Result list routines
1375  ***************************************************************************/
1376 
1377 
1378 void ranklist_handle_ties (int k, long *idx, const float *dis)
1379 {
1380  float prev_dis = -1e38;
1381  int prev_i = -1;
1382  for (int i = 0; i < k; i++) {
1383  if (dis[i] != prev_dis) {
1384  if (i > prev_i + 1) {
1385  // sort between prev_i and i - 1
1386  std::sort (idx + prev_i, idx + i);
1387  }
1388  prev_i = i;
1389  prev_dis = dis[i];
1390  }
1391  }
1392 }
1393 
1394 size_t ranklist_intersection_size (size_t k1, const long *v1,
1395  size_t k2, const long *v2_in)
1396 {
1397  if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
1398  long *v2 = new long [k2];
1399  memcpy (v2, v2_in, sizeof (long) * k2);
1400  std::sort (v2, v2 + k2);
1401  { // de-dup v2
1402  long prev = -1;
1403  size_t wp = 0;
1404  for (size_t i = 0; i < k2; i++) {
1405  if (v2 [i] != prev) {
1406  v2[wp++] = prev = v2 [i];
1407  }
1408  }
1409  k2 = wp;
1410  }
1411  const long seen_flag = 1L << 60;
1412  size_t count = 0;
1413  for (size_t i = 0; i < k1; i++) {
1414  long q = v1 [i];
1415  size_t i0 = 0, i1 = k2;
1416  while (i0 + 1 < i1) {
1417  size_t imed = (i1 + i0) / 2;
1418  long piv = v2 [imed] & ~seen_flag;
1419  if (piv <= q) i0 = imed;
1420  else i1 = imed;
1421  }
1422  if (v2 [i0] == q) {
1423  count++;
1424  v2 [i0] |= seen_flag;
1425  }
1426  }
1427  delete [] v2;
1428 
1429  return count;
1430 }
1431 
1432 double imbalance_factor (int k, const int *hist) {
1433  double tot = 0, uf = 0;
1434 
1435  for (int i = 0 ; i < k ; i++) {
1436  tot += hist[i];
1437  uf += hist[i] * (double) hist[i];
1438  }
1439  uf = uf * k / (tot * tot);
1440 
1441  return uf;
1442 }
1443 
1444 
1445 double imbalance_factor (int n, int k, const long *assign) {
1446  std::vector<int> hist(k, 0);
1447  for (int i = 0; i < n; i++) {
1448  hist[assign[i]]++;
1449  }
1450 
1451  return imbalance_factor (k, hist.data());
1452 }
1453 
1454 
1455 
1456 int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
1457  memset (hist, 0, sizeof(hist[0]) * vmax);
1458  int nout = 0;
1459  while (n--) {
1460  if (v[n] < 0 || v[n] >= vmax) nout++;
1461  else hist[v[n]]++;
1462  }
1463  return nout;
1464 }
1465 
1466 
1467 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
1468 {
1469  FAISS_ASSERT (nbits % 8 == 0);
1470  size_t d = nbits / 8;
1471  std::vector<int> accu(d * 256);
1472  const uint8_t *c = codes;
1473  for (size_t i = 0; i < n; i++)
1474  for(int j = 0; j < d; j++)
1475  accu[j * 256 + *c++]++;
1476  memset (hist, 0, sizeof(*hist) * nbits);
1477  for (int i = 0; i < d; i++) {
1478  const int *ai = accu.data() + i * 256;
1479  int * hi = hist + i * 8;
1480  for (int j = 0; j < 256; j++)
1481  for (int k = 0; k < 8; k++)
1482  if ((j >> k) & 1)
1483  hi[k] += ai[j];
1484  }
1485 
1486 }
1487 
1488 
1489 
1490 size_t ivec_checksum (size_t n, const int *a)
1491 {
1492  size_t cs = 112909;
1493  while (n--) cs = cs * 65713 + a[n] * 1686049;
1494  return cs;
1495 }
1496 
1497 
1498 namespace {
1499  struct ArgsortComparator {
1500  const float *vals;
1501  bool operator() (const size_t a, const size_t b) const {
1502  return vals[a] < vals[b];
1503  }
1504  };
1505 
1506  struct SegmentS {
1507  size_t i0; // begin pointer in the permutation array
1508  size_t i1; // end
1509  size_t len() const {
1510  return i1 - i0;
1511  }
1512  };
1513 
1514  // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
1515  // extended to > 1 merge thread
1516 
1517  // merges 2 ranges that should be consecutive on the source into
1518  // the union of the two on the destination
1519  template<typename T>
1520  void parallel_merge (const T *src, T *dst,
1521  SegmentS &s1, SegmentS & s2, int nt,
1522  const ArgsortComparator & comp) {
1523  if (s2.len() > s1.len()) { // make sure that s1 larger than s2
1524  std::swap(s1, s2);
1525  }
1526 
1527  // compute sub-ranges for each thread
1528  SegmentS s1s[nt], s2s[nt], sws[nt];
1529  s2s[0].i0 = s2.i0;
1530  s2s[nt - 1].i1 = s2.i1;
1531 
1532  // not sure parallel actually helps here
1533 #pragma omp parallel for num_threads(nt)
1534  for (int t = 0; t < nt; t++) {
1535  s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1536  s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1537 
1538  if (t + 1 < nt) {
1539  T pivot = src[s1s[t].i1];
1540  size_t i0 = s2.i0, i1 = s2.i1;
1541  while (i0 + 1 < i1) {
1542  size_t imed = (i1 + i0) / 2;
1543  if (comp (pivot, src[imed])) {i1 = imed; }
1544  else {i0 = imed; }
1545  }
1546  s2s[t].i1 = s2s[t + 1].i0 = i1;
1547  }
1548  }
1549  s1.i0 = std::min(s1.i0, s2.i0);
1550  s1.i1 = std::max(s1.i1, s2.i1);
1551  s2 = s1;
1552  sws[0].i0 = s1.i0;
1553  for (int t = 0; t < nt; t++) {
1554  sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1555  if (t + 1 < nt) {
1556  sws[t + 1].i0 = sws[t].i1;
1557  }
1558  }
1559  assert(sws[nt - 1].i1 == s1.i1);
1560 
1561  // do the actual merging
1562 #pragma omp parallel for num_threads(nt)
1563  for (int t = 0; t < nt; t++) {
1564  SegmentS sw = sws[t];
1565  SegmentS s1t = s1s[t];
1566  SegmentS s2t = s2s[t];
1567  if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1568  for (;;) {
1569  // assert (sw.len() == s1t.len() + s2t.len());
1570  if (comp(src[s1t.i0], src[s2t.i0])) {
1571  dst[sw.i0++] = src[s1t.i0++];
1572  if (s1t.i0 == s1t.i1) break;
1573  } else {
1574  dst[sw.i0++] = src[s2t.i0++];
1575  if (s2t.i0 == s2t.i1) break;
1576  }
1577  }
1578  }
1579  if (s1t.len() > 0) {
1580  assert(s1t.len() == sw.len());
1581  memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
1582  } else if (s2t.len() > 0) {
1583  assert(s2t.len() == sw.len());
1584  memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
1585  }
1586  }
1587  }
1588 
1589 };
1590 
1591 void fvec_argsort (size_t n, const float *vals,
1592  size_t *perm)
1593 {
1594  for (size_t i = 0; i < n; i++) perm[i] = i;
1595  ArgsortComparator comp = {vals};
1596  std::sort (perm, perm + n, comp);
1597 }
1598 
1599 void fvec_argsort_parallel (size_t n, const float *vals,
1600  size_t *perm)
1601 {
1602  size_t * perm2 = new size_t[n];
1603  // 2 result tables, during merging, flip between them
1604  size_t *permB = perm2, *permA = perm;
1605 
1606  int nt = omp_get_max_threads();
1607  { // prepare correct permutation so that the result ends in perm
1608  // at final iteration
1609  int nseg = nt;
1610  while (nseg > 1) {
1611  nseg = (nseg + 1) / 2;
1612  std::swap (permA, permB);
1613  }
1614  }
1615 
1616 #pragma omp parallel
1617  for (size_t i = 0; i < n; i++) permA[i] = i;
1618 
1619  ArgsortComparator comp = {vals};
1620 
1621  SegmentS segs[nt];
1622 
1623  // independent sorts
1624 #pragma omp parallel for
1625  for (int t = 0; t < nt; t++) {
1626  size_t i0 = t * n / nt;
1627  size_t i1 = (t + 1) * n / nt;
1628  SegmentS seg = {i0, i1};
1629  std::sort (permA + seg.i0, permA + seg.i1, comp);
1630  segs[t] = seg;
1631  }
1632  int prev_nested = omp_get_nested();
1633  omp_set_nested(1);
1634 
1635  int nseg = nt;
1636  while (nseg > 1) {
1637  int nseg1 = (nseg + 1) / 2;
1638  int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1639  int sub_nseg1 = nseg / 2;
1640 
1641 #pragma omp parallel for num_threads(nseg1)
1642  for (int s = 0; s < nseg; s += 2) {
1643  if (s + 1 == nseg) { // otherwise isolated segment
1644  memcpy(permB + segs[s].i0, permA + segs[s].i0,
1645  segs[s].len() * sizeof(size_t));
1646  } else {
1647  int t0 = s * sub_nt / sub_nseg1;
1648  int t1 = (s + 1) * sub_nt / sub_nseg1;
1649  printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1650  parallel_merge(permA, permB, segs[s], segs[s + 1],
1651  t1 - t0, comp);
1652  }
1653  }
1654  for (int s = 0; s < nseg; s += 2)
1655  segs[s / 2] = segs[s];
1656  nseg = nseg1;
1657  std::swap (permA, permB);
1658  }
1659  assert (permA == perm);
1660  omp_set_nested(prev_nested);
1661  delete [] perm2;
1662 }
1663 
1664 
1665 
1666 
1667 
1668 
1669 
1670 
1671 
1672 
1673 
1674 
1675 
1676 
1677 
1678 
1679 /***************************************************************************
1680  * heavily optimized table computations
1681  ***************************************************************************/
1682 
1683 
1684 static inline void fvec_madd_ref (size_t n, const float *a,
1685  float bf, const float *b, float *c) {
1686  for (size_t i = 0; i < n; i++)
1687  c[i] = a[i] + bf * b[i];
1688 }
1689 
1690 
1691 static inline void fvec_madd_sse (size_t n, const float *a,
1692  float bf, const float *b, float *c) {
1693  n >>= 2;
1694  __m128 bf4 = _mm_set_ps1 (bf);
1695  __m128 * a4 = (__m128*)a;
1696  __m128 * b4 = (__m128*)b;
1697  __m128 * c4 = (__m128*)c;
1698 
1699  while (n--) {
1700  *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1701  b4++;
1702  a4++;
1703  c4++;
1704  }
1705 }
1706 
1707 void fvec_madd (size_t n, const float *a,
1708  float bf, const float *b, float *c)
1709 {
1710  if ((n & 3) == 0 &&
1711  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1712  fvec_madd_sse (n, a, bf, b, c);
1713  else
1714  fvec_madd_ref (n, a, bf, b, c);
1715 }
1716 
1717 static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
1718  float bf, const float *b, float *c) {
1719  float vmin = 1e20;
1720  int imin = -1;
1721 
1722  for (size_t i = 0; i < n; i++) {
1723  c[i] = a[i] + bf * b[i];
1724  if (c[i] < vmin) {
1725  vmin = c[i];
1726  imin = i;
1727  }
1728  }
1729  return imin;
1730 }
1731 
1732 static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
1733  float bf, const float *b, float *c) {
1734  n >>= 2;
1735  __m128 bf4 = _mm_set_ps1 (bf);
1736  __m128 vmin4 = _mm_set_ps1 (1e20);
1737  __m128i imin4 = _mm_set1_epi32 (-1);
1738  __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1739  __m128i inc4 = _mm_set1_epi32 (4);
1740  __m128 * a4 = (__m128*)a;
1741  __m128 * b4 = (__m128*)b;
1742  __m128 * c4 = (__m128*)c;
1743 
1744  while (n--) {
1745  __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1746  *c4 = vc4;
1747  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1748  // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
1749 
1750  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1751  _mm_andnot_si128 (mask, imin4));
1752  vmin4 = _mm_min_ps (vmin4, vc4);
1753  b4++;
1754  a4++;
1755  c4++;
1756  idx4 = _mm_add_epi32 (idx4, inc4);
1757  }
1758 
1759  // 4 values -> 2
1760  {
1761  idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1762  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1763  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1764  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1765  _mm_andnot_si128 (mask, imin4));
1766  vmin4 = _mm_min_ps (vmin4, vc4);
1767  }
1768  // 2 values -> 1
1769  {
1770  idx4 = _mm_shuffle_epi32 (imin4, 1);
1771  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1772  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1773  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1774  _mm_andnot_si128 (mask, imin4));
1775  // vmin4 = _mm_min_ps (vmin4, vc4);
1776  }
1777  return _mm_extract_epi32 (imin4, 0);
1778 }
1779 
1780 
1781 int fvec_madd_and_argmin (size_t n, const float *a,
1782  float bf, const float *b, float *c)
1783 {
1784  if ((n & 3) == 0 &&
1785  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1786  return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1787  else
1788  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
1789 }
1790 
1791 
1792 
1794  size_t d, size_t *n, size_t nmax, const float *x,
1795  bool verbose, long seed)
1796 {
1797 
1798  if (*n <= nmax) return x; // nothing to do
1799 
1800  size_t n2 = nmax;
1801  if (verbose) {
1802  printf (" Input training set too big (max size is %ld), sampling "
1803  "%ld / %ld vectors\n", nmax, n2, *n);
1804  }
1805  std::vector<int> subset (*n);
1806  rand_perm (subset.data (), *n, seed);
1807  float *x_subset = new float[n2 * d];
1808  for (long i = 0; i < n2; i++)
1809  memcpy (&x_subset[i * d],
1810  &x[subset[i] * size_t(d)],
1811  sizeof (x[0]) * d);
1812  *n = n2;
1813  return x_subset;
1814 }
1815 
1816 
1817 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:49
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:870
RandomGenerator(long seed=1234)
initialize
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n)
Definition: utils.cpp:1286
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:430
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1467
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1793
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1378
float rand_float()
between 0 and 1
Definition: utils.cpp:208
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1707
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1456
long rand_long()
random long &lt; 2 ^ 62
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1394
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1228
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1166
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:830
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:71
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1432
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:511
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1151
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1206
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1490
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1781
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:850