Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
/data/users/matthijs/github_faiss/faiss/utils.cpp
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved
11 // -*- c++ -*-
12 
13 #include "utils.h"
14 
15 #include <cstdio>
16 #include <cassert>
17 #include <cstring>
18 #include <cmath>
19 
20 #include <smmintrin.h>
21 #include <mmintrin.h>
22 
23 #include <sys/time.h>
24 #include <sys/types.h>
25 #include <unistd.h>
26 
27 #include <omp.h>
28 
29 
30 #include <algorithm>
31 #include <vector>
32 
33 #include "AuxIndexStructures.h"
34 #include "FaissAssert.h"
35 
36 
37 
38 #ifndef FINTEGER
39 #define FINTEGER long
40 #endif
41 
42 
43 extern "C" {
44 
45 /* declare BLAS functions, see http://www.netlib.org/clapack/cblas/ */
46 
47 int sgemm_ (const char *transa, const char *transb, FINTEGER *m, FINTEGER *
48  n, FINTEGER *k, const float *alpha, const float *a,
49  FINTEGER *lda, const float *b, FINTEGER *
50  ldb, float *beta, float *c, FINTEGER *ldc);
51 
52 /* Lapack functions, see http://www.netlib.org/clapack/old/single/sgeqrf.c */
53 
54 int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
55  float *tau, float *work, FINTEGER *lwork, FINTEGER *info);
56 
57 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
58  FINTEGER *lda, float *tau, float *work,
59  FINTEGER *lwork, FINTEGER *info);
60 
61 
62 }
63 
64 
65 /**************************************************
66  * Get some stats about the system
67  **************************************************/
68 
69 namespace faiss {
70 
71 double getmillisecs () {
72  struct timeval tv;
73  gettimeofday (&tv, nullptr);
74  return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
75 }
76 
77 
78 #ifdef __linux__
79 
80 size_t get_mem_usage_kb ()
81 {
82  int pid = getpid ();
83  char fname[256];
84  snprintf (fname, 256, "/proc/%d/status", pid);
85  FILE * f = fopen (fname, "r");
86  FAISS_ASSERT (f || !"cannot open proc status file") ;
87  size_t sz = 0;
88  for (;;) {
89  char buf [256];
90  if (!fgets (buf, 256, f)) break;
91  if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1) break;
92  }
93  fclose (f);
94  return sz;
95 }
96 
97 #elif __APPLE__
98 
99 size_t get_mem_usage_kb ()
100 {
101  fprintf(stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
102  return 0;
103 }
104 
105 #endif
106 
107 
108 
109 /**************************************************
110  * Random data generation functions
111  **************************************************/
112 
113 /**
114  * The definition of random functions depends on the architecture:
115  *
116  * - for Linux, we rely on re-entrant functions (random_r). This
117  * provides good quality reproducible random sequences.
118  *
119  * - for Apple, we use rand_r. Apple is trying so hard to deprecate
120  * this function that it removed its definition form stdlib.h, so we
121  * re-declare it below. Fortunately, since it is deprecated, its
122  * prototype should not change much in the forerseeable future.
123  *
124  * Unfortunately, system designers are more concerned with making the
125  * most unpredictable random sequences for cryptographic use, when in
126  * scientific contexts what acutally matters is having reproducible
127  * squences in multi-threaded contexts.
128  */
129 
130 
131 #ifdef __linux__
132 
133 
134 
135 
137 {
138  int32_t a;
139  random_r (&rand_data, &a);
140  return a;
141 }
142 
144 {
145  int32_t a, b;
146  random_r (&rand_data, &a);
147  random_r (&rand_data, &b);
148  return long(a) | long(b) << 31;
149 }
150 
151 
153 {
154  memset (&rand_data, 0, sizeof (rand_data));
155  initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
156 }
157 
158 
159 RandomGenerator::RandomGenerator (const RandomGenerator & other)
160 {
161  memcpy (rand_state, other.rand_state, sizeof(rand_state));
162  rand_data = other.rand_data;
163  setstate_r (rand_state, &rand_data);
164 }
165 
166 
167 #elif __APPLE__
168 
169 extern "C" {
170 int rand_r(unsigned *seed);
171 }
172 
174 {
175  rand_state = seed;
176 }
177 
178 
179 RandomGenerator::RandomGenerator (const RandomGenerator & other)
180 {
181  rand_state = other.rand_state;
182 }
183 
184 
186 {
187  // RAND_MAX is 31 bits
188  // try to add more randomness in the lower bits
189  int lowbits = rand_r(&rand_state) >> 15;
190  return rand_r(&rand_state) ^ lowbits;
191 }
192 
194 {
195  return long(random()) | long(random()) << 31;
196 }
197 
198 
199 
200 #endif
201 
203 { // this suffers form non-uniform probabilities when max is not a
204  // power of 2, but if RAND_MAX >> max the bias is limited.
205  return rand_int () % max;
206 }
207 
209 {
210  return rand_int() / float(1L << 31);
211 }
212 
213 double RandomGenerator::rand_double ()
214 {
215  return rand_long() / double(1L << 62);
216 }
217 
218 
219 /***********************************************************************
220  * Random functions in this C file only exist because Torch
221  * counterparts are slow and not multi-threaded. Typical use is for
222  * more than 1-100 billion values. */
223 
224 
225 /* Generate a set of random floating point values such that x[i] in [0,1]
226  multi-threading. For this reason, we rely on re-entreant functions. */
227 void float_rand (float * x, size_t n, long seed)
228 {
229  // only try to parallelize on large enough arrays
230  const size_t nblock = n < 1024 ? 1 : 1024;
231 
232  RandomGenerator rng0 (seed);
233  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
234 
235 #pragma omp parallel for
236  for (size_t j = 0; j < nblock; j++) {
237 
238  RandomGenerator rng (a0 + j * b0);
239 
240  const size_t istart = j * n / nblock;
241  const size_t iend = (j + 1) * n / nblock;
242 
243  for (size_t i = istart; i < iend; i++)
244  x[i] = rng.rand_float ();
245  }
246 }
247 
248 
249 void float_randn (float * x, size_t n, long seed)
250 {
251  // only try to parallelize on large enough arrays
252  const size_t nblock = n < 1024 ? 1 : 1024;
253 
254  RandomGenerator rng0 (seed);
255  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
256 
257 #pragma omp parallel for
258  for (size_t j = 0; j < nblock; j++) {
259  RandomGenerator rng (a0 + j * b0);
260 
261  double a = 0, b = 0, s = 0;
262  int state = 0; /* generate two number per "do-while" loop */
263 
264  const size_t istart = j * n / nblock;
265  const size_t iend = (j + 1) * n / nblock;
266 
267  for (size_t i = istart; i < iend; i++) {
268  /* Marsaglia's method (see Knuth) */
269  if (state == 0) {
270  do {
271  a = 2.0 * rng.rand_double () - 1;
272  b = 2.0 * rng.rand_double () - 1;
273  s = a * a + b * b;
274  } while (s >= 1.0);
275  x[i] = a * sqrt(-2.0 * log(s) / s);
276  }
277  else
278  x[i] = b * sqrt(-2.0 * log(s) / s);
279  state = 1 - state;
280  }
281  }
282 }
283 
284 
285 /* Integer versions */
286 void long_rand (long * x, size_t n, long seed)
287 {
288  // only try to parallelize on large enough arrays
289  const size_t nblock = n < 1024 ? 1 : 1024;
290 
291  RandomGenerator rng0 (seed);
292  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
293 
294 #pragma omp parallel for
295  for (size_t j = 0; j < nblock; j++) {
296 
297  RandomGenerator rng (a0 + j * b0);
298 
299  const size_t istart = j * n / nblock;
300  const size_t iend = (j + 1) * n / nblock;
301  for (size_t i = istart; i < iend; i++)
302  x[i] = rng.rand_long ();
303  }
304 }
305 
306 
307 
308 void rand_perm (int *perm, size_t n, long seed)
309 {
310  for (size_t i = 0; i < n; i++) perm[i] = i;
311 
312  RandomGenerator rng (seed);
313 
314  for (size_t i = 0; i + 1 < n; i++) {
315  int i2 = i + rng.rand_int (n - i);
316  std::swap(perm[i], perm[i2]);
317  }
318 }
319 
320 
321 
322 
323 void byte_rand (uint8_t * x, size_t n, long seed)
324 {
325  // only try to parallelize on large enough arrays
326  const size_t nblock = n < 1024 ? 1 : 1024;
327 
328  RandomGenerator rng0 (seed);
329  int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
330 
331 #pragma omp parallel for
332  for (size_t j = 0; j < nblock; j++) {
333 
334  RandomGenerator rng (a0 + j * b0);
335 
336  const size_t istart = j * n / nblock;
337  const size_t iend = (j + 1) * n / nblock;
338 
339  size_t i;
340  for (i = istart; i < iend; i++)
341  x[i] = rng.rand_long ();
342  }
343 }
344 
345 
346 
347 void reflection (const float * __restrict u,
348  float * __restrict x,
349  size_t n, size_t d, size_t nu)
350 {
351  size_t i, j, l;
352  for (i = 0; i < n; i++) {
353  const float * up = u;
354  for (l = 0; l < nu; l++) {
355  float ip1 = 0, ip2 = 0;
356 
357  for (j = 0; j < d; j+=2) {
358  ip1 += up[j] * x[j];
359  ip2 += up[j+1] * x[j+1];
360  }
361  float ip = 2 * (ip1 + ip2);
362 
363  for (j = 0; j < d; j++)
364  x[j] -= ip * up[j];
365  up += d;
366  }
367  x += d;
368  }
369 }
370 
371 
372 /* Reference implementation (slower) */
373 void reflection_ref (const float * u, float * x, size_t n, size_t d, size_t nu)
374 {
375  size_t i, j, l;
376  for (i = 0; i < n; i++) {
377  const float * up = u;
378  for (l = 0; l < nu; l++) {
379  double ip = 0;
380 
381  for (j = 0; j < d; j++)
382  ip += up[j] * x[j];
383  ip *= 2;
384 
385  for (j = 0; j < d; j++)
386  x[j] -= ip * up[j];
387 
388  up += d;
389  }
390  x += d;
391  }
392 }
393 
394 /*********************************************************
395  * Optimized distance computations
396  *********************************************************/
397 
398 
399 
400 /* Functions to compute:
401  - L2 distance between 2 vectors
402  - inner product between 2 vectors
403  - L2 norm of a vector
404 
405  The functions should probably not be invoked when a large number of
406  vectors are be processed in batch (in which case Matrix multiply
407  is faster), but may be useful for comparing vectors isolated in
408  memory.
409 
410  Works with any vectors of any dimension, even unaligned (in which
411  case they are slower).
412 
413 */
414 
415 // reads 0 <= d < 4 floats as __m128
416 static inline __m128 masked_read (int d, const float *x)
417 {
418  assert (0 <= d && d < 4);
419  __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
420  switch (d) {
421  case 3:
422  buf[2] = x[2];
423  case 2:
424  buf[1] = x[1];
425  case 1:
426  buf[0] = x[0];
427  }
428  return _mm_load_ps (buf);
429 }
430 
431 /* SSE-implementation of L2 distance */
432 float fvec_L2sqr (const float * x,
433  const float * y,
434  size_t d)
435 {
436  __m128 mx, my;
437  __m128 msum1 = _mm_setzero_ps();
438 
439  while (d >= 4) {
440  mx = _mm_loadu_ps (x); x += 4;
441  my = _mm_loadu_ps (y); y += 4;
442  const __m128 a_m_b1 = _mm_sub_ps (mx, my);
443  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
444  d -= 4;
445  }
446 
447  // add the last 1, 2 or 3 values
448  mx = masked_read (d, x);
449  my = masked_read (d, y);
450  __m128 a_m_b1 = _mm_sub_ps (mx, my);
451 
452  msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
453 
454  msum1 = _mm_hadd_ps (msum1, msum1);
455  msum1 = _mm_hadd_ps (msum1, msum1);
456  return _mm_cvtss_f32 (msum1);
457 }
458 
459 
460 /* same without SSE */
461 float fvec_L2sqr_ref (const float * x,
462  const float * y,
463  size_t d)
464 {
465  size_t i;
466  float res_ = 0;
467  for (i = 0; i < d; i++) {
468  const float tmp = x[i] - y[i];
469  res_ += tmp * tmp;
470  }
471  return res_;
472 }
473 
474 float fvec_inner_product (const float * x,
475  const float * y,
476  size_t d)
477 {
478  __m128 mx, my;
479  __m128 msum1 = _mm_setzero_ps();
480 
481  while (d >= 4) {
482  mx = _mm_loadu_ps (x); x += 4;
483  my = _mm_loadu_ps (y); y += 4;
484  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
485  d -= 4;
486  }
487 
488  // add the last 1, 2, or 3 values
489  mx = masked_read (d, x);
490  my = masked_read (d, y);
491  __m128 prod = _mm_mul_ps (mx, my);
492 
493  msum1 = _mm_add_ps (msum1, prod);
494 
495  msum1 = _mm_hadd_ps (msum1, msum1);
496  msum1 = _mm_hadd_ps (msum1, msum1);
497  return _mm_cvtss_f32 (msum1);
498 }
499 
500 
501 float fvec_inner_product_ref (const float * x,
502  const float * y,
503  size_t d)
504 {
505  size_t i;
506  float res_ = 0;
507  for (i = 0; i < d; i++)
508  res_ += x[i] * y[i];
509  return res_;
510 }
511 
512 
513 float fvec_norm_L2sqr (const float * x,
514  size_t d)
515 {
516  __m128 mx;
517  __m128 msum1 = _mm_setzero_ps();
518 
519  while (d >= 4) {
520  mx = _mm_loadu_ps (x); x += 4;
521  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
522  d -= 4;
523  }
524 
525  mx = masked_read (d, x);
526  msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
527 
528  msum1 = _mm_hadd_ps (msum1, msum1);
529  msum1 = _mm_hadd_ps (msum1, msum1);
530  return _mm_cvtss_f32 (msum1);
531 }
532 
533 
534 float fvec_norm_L2sqr_ref (const float * __restrict x,
535  size_t d)
536 {
537  size_t i;
538  double res_ = 0;
539  for (i = 0; i < d; i++)
540  res_ += x[i] * x[i];
541  return res_;
542 }
543 
544 
545 
546 
547 /* Compute the inner product between a vector x and
548  a set of ny vectors y.
549  These functions are not intended to replace BLAS matrix-matrix, as they
550  would be significantly less efficient in this case. */
551 void fvec_inner_products_ny (float * __restrict ip,
552  const float * x,
553  const float * y,
554  size_t d, size_t ny)
555 {
556  for (size_t i = 0; i < ny; i++) {
557  ip[i] = fvec_inner_product (x, y, d);
558  y += d;
559  }
560 }
561 
562 
563 
564 
565 /* compute ny L2 distances between x and a set of vectors y */
566 void fvec_L2sqr_ny (float * __restrict dis,
567  const float * x,
568  const float * y,
569  size_t d, size_t ny)
570 {
571  for (size_t i = 0; i < ny; i++) {
572  dis[i] = fvec_L2sqr (x, y, d);
573  y += d;
574  }
575 }
576 
577 
578 
579 
580 /* Compute the L2 norm of a set of nx vectors */
581 void fvec_norms_L2 (float * __restrict nr,
582  const float * __restrict x,
583  size_t d, size_t nx)
584 {
585 
586 #pragma omp parallel for
587  for (size_t i = 0; i < nx; i++) {
588  nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
589  }
590 }
591 
592 void fvec_norms_L2sqr (float * __restrict nr,
593  const float * __restrict x,
594  size_t d, size_t nx)
595 {
596 #pragma omp parallel for
597  for (size_t i = 0; i < nx; i++)
598  nr[i] = fvec_norm_L2sqr (x + i * d, d);
599 }
600 
601 
602 
603 void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
604 {
605 #pragma omp parallel for
606  for (size_t i = 0; i < nx; i++) {
607  float * __restrict xi = x + i * d;
608 
609  float nr = fvec_norm_L2sqr (xi, d);
610 
611  if (nr > 0) {
612  size_t j;
613  const float inv_nr = 1.0 / sqrtf (nr);
614  for (j = 0; j < d; j++)
615  xi[j] *= inv_nr;
616  }
617  }
618 }
619 
620 
621 
622 
623 
624 
625 
626 
627 
628 
629 
630 
631 
632 
633 
634 
635 
636 /***************************************************************************
637  * KNN functions
638  ***************************************************************************/
639 
640 
641 
642 /* Find the nearest neighbors for nx queries in a set of ny vectors */
643 static void knn_inner_product_sse (const float * x,
644  const float * y,
645  size_t d, size_t nx, size_t ny,
646  float_minheap_array_t * res)
647 {
648  size_t k = res->k;
649 
650 #pragma omp parallel for
651  for (size_t i = 0; i < nx; i++) {
652  const float * x_ = x + i * d;
653  const float * y_ = y;
654 
655  float * __restrict simi = res->get_val(i);
656  long * __restrict idxi = res->get_ids (i);
657 
658  minheap_heapify (k, simi, idxi);
659 
660  for (size_t j = 0; j < ny; j++) {
661  float ip = fvec_inner_product (x_, y_, d);
662 
663  if (ip > simi[0]) {
664  minheap_pop (k, simi, idxi);
665  minheap_push (k, simi, idxi, ip, j);
666  }
667  y_ += d;
668  }
669  minheap_reorder (k, simi, idxi);
670  }
671 
672 }
673 
674 static void knn_L2sqr_sse (
675  const float * x,
676  const float * y,
677  size_t d, size_t nx, size_t ny,
678  float_maxheap_array_t * res)
679 {
680  size_t k = res->k;
681 
682 #pragma omp parallel for
683  for (size_t i = 0; i < nx; i++) {
684  const float * x_ = x + i * d;
685  const float * y_ = y;
686  size_t j;
687  float * __restrict simi = res->get_val(i);
688  long * __restrict idxi = res->get_ids (i);
689 
690  maxheap_heapify (k, simi, idxi);
691  for (j = 0; j < ny; j++) {
692  float disij = fvec_L2sqr (x_, y_, d);
693 
694  if (disij < simi[0]) {
695  maxheap_pop (k, simi, idxi);
696  maxheap_push (k, simi, idxi, disij, j);
697  }
698  y_ += d;
699  }
700  maxheap_reorder (k, simi, idxi);
701  }
702 
703 }
704 
705 
706 /** Find the nearest neighbors for nx queries in a set of ny vectors */
707 static void knn_inner_product_blas (
708  const float * x,
709  const float * y,
710  size_t d, size_t nx, size_t ny,
711  float_minheap_array_t * res)
712 {
713  res->heapify ();
714 
715  // BLAS does not like empty matrices
716  if (nx == 0 || ny == 0) return;
717 
718  /* block sizes */
719  const size_t bs_x = 4096, bs_y = 1024;
720  // const size_t bs_x = 16, bs_y = 16;
721  float *ip_block = new float[bs_x * bs_y];
722 
723  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
724  size_t i1 = i0 + bs_x;
725  if(i1 > nx) i1 = nx;
726 
727  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
728  size_t j1 = j0 + bs_y;
729  if (j1 > ny) j1 = ny;
730  /* compute the actual dot products */
731  {
732  float one = 1, zero = 0;
733  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
734  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
735  y + j0 * d, &di,
736  x + i0 * d, &di, &zero,
737  ip_block, &nyi);
738  }
739 
740  /* collect maxima */
741  res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
742  }
743  }
744  delete [] ip_block;
745  res->reorder ();
746 }
747 
748 // distance correction is an operator that can be applied to transform
749 // the distances
750 template<class DistanceCorrection>
751 static void knn_L2sqr_blas (const float * x,
752  const float * y,
753  size_t d, size_t nx, size_t ny,
754  float_maxheap_array_t * res,
755  const DistanceCorrection &corr)
756 {
757  res->heapify ();
758 
759  // BLAS does not like empty matrices
760  if (nx == 0 || ny == 0) return;
761 
762  size_t k = res->k;
763 
764  /* block sizes */
765  const size_t bs_x = 4096, bs_y = 1024;
766  // const size_t bs_x = 16, bs_y = 16;
767  float *ip_block = new float[bs_x * bs_y];
768 
769  float *x_norms = new float[nx];
770  fvec_norms_L2sqr (x_norms, x, d, nx);
771 
772  float *y_norms = new float[ny];
773  fvec_norms_L2sqr (y_norms, y, d, ny);
774 
775  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
776  size_t i1 = i0 + bs_x;
777  if(i1 > nx) i1 = nx;
778 
779  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
780  size_t j1 = j0 + bs_y;
781  if (j1 > ny) j1 = ny;
782  /* compute the actual dot products */
783  {
784  float one = 1, zero = 0;
785  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
786  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
787  y + j0 * d, &di,
788  x + i0 * d, &di, &zero,
789  ip_block, &nyi);
790  }
791 
792  /* collect minima */
793 #pragma omp parallel for
794  for (size_t i = i0; i < i1; i++) {
795  float * __restrict simi = res->get_val(i);
796  long * __restrict idxi = res->get_ids (i);
797  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
798 
799  for (size_t j = j0; j < j1; j++) {
800  float ip = *ip_line++;
801  float dis = x_norms[i] + y_norms[j] - 2 * ip;
802 
803  dis = corr (dis, i, j);
804 
805  if (dis < simi[0]) {
806  maxheap_pop (k, simi, idxi);
807  maxheap_push (k, simi, idxi, dis, j);
808  }
809  }
810  }
811  }
812  }
813  res->reorder ();
814 
815  delete [] ip_block;
816  delete [] x_norms;
817  delete [] y_norms;
818 }
819 
820 
821 
822 
823 
824 
825 
826 
827 
828 /*******************************************************
829  * KNN driver functions
830  *******************************************************/
831 
832 void knn_inner_product (const float * x,
833  const float * y,
834  size_t d, size_t nx, size_t ny,
835  float_minheap_array_t * res)
836 {
837  if (d % 4 == 0 && nx < 20) {
838  knn_inner_product_sse (x, y, d, nx, ny, res);
839  } else {
840  knn_inner_product_blas (x, y, d, nx, ny, res);
841  }
842 }
843 
844 
845 
847  float operator()(float dis, size_t qno, size_t bno) const {
848  return dis;
849  }
850 };
851 
852 void knn_L2sqr (const float * x,
853  const float * y,
854  size_t d, size_t nx, size_t ny,
855  float_maxheap_array_t * res)
856 {
857  if (d % 4 == 0 && nx < 20) {
858  knn_L2sqr_sse (x, y, d, nx, ny, res);
859  } else {
861  knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
862  }
863 }
864 
866  const float *base_shift;
867  float operator()(float dis, size_t qno, size_t bno) const {
868  return dis - base_shift[bno];
869  }
870 };
871 
873  const float * x,
874  const float * y,
875  size_t d, size_t nx, size_t ny,
876  float_maxheap_array_t * res,
877  const float *base_shift)
878 {
879  BaseShiftDistanceCorrection corr = {base_shift};
880  knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
881 }
882 
883 
884 
885 /***************************************************************************
886  * compute a subset of distances
887  ***************************************************************************/
888 
889 /* compute the inner product between x and a subset y of ny vectors,
890  whose indices are given by idy. */
891 void fvec_inner_products_by_idx (float * __restrict ip,
892  const float * x,
893  const float * y,
894  const long * __restrict ids, /* for y vecs */
895  size_t d, size_t nx, size_t ny)
896 {
897 #pragma omp parallel for
898  for (size_t j = 0; j < nx; j++) {
899  const long * __restrict idsj = ids + j * ny;
900  const float * xj = x + j * d;
901  float * __restrict ipj = ip + j * ny;
902  for (size_t i = 0; i < ny; i++) {
903  if (idsj[i] < 0)
904  continue;
905  ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
906  }
907  }
908 }
909 
910 /* compute the inner product between x and a subset y of ny vectors,
911  whose indices are given by idy. */
912 void fvec_L2sqr_by_idx (float * __restrict dis,
913  const float * x,
914  const float * y,
915  const long * __restrict ids, /* ids of y vecs */
916  size_t d, size_t nx, size_t ny)
917 {
918 #pragma omp parallel for
919  for (size_t j = 0; j < nx; j++) {
920  const long * __restrict idsj = ids + j * ny;
921  const float * xj = x + j * d;
922  float * __restrict disj = dis + j * ny;
923  for (size_t i = 0; i < ny; i++) {
924  if (idsj[i] < 0)
925  continue;
926  disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
927  }
928  }
929 }
930 
931 
932 
933 
934 
935 /* Find the nearest neighbors for nx queries in a set of ny vectors
936  indexed by ids. May be useful for re-ranking a pre-selected vector list */
937 void knn_inner_products_by_idx (const float * x,
938  const float * y,
939  const long * ids,
940  size_t d, size_t nx, size_t ny,
941  float_minheap_array_t * res)
942 {
943  size_t k = res->k;
944 
945 #pragma omp parallel for
946  for (size_t i = 0; i < nx; i++) {
947  const float * x_ = x + i * d;
948  const long * idsi = ids + i * ny;
949  size_t j;
950  float * __restrict simi = res->get_val(i);
951  long * __restrict idxi = res->get_ids (i);
952  minheap_heapify (k, simi, idxi);
953 
954  for (j = 0; j < ny; j++) {
955  if (idsi[j] < 0) break;
956  float ip = fvec_inner_product (x_, y + d * idsi[j], d);
957 
958  if (ip > simi[0]) {
959  minheap_pop (k, simi, idxi);
960  minheap_push (k, simi, idxi, ip, idsi[j]);
961  }
962  }
963  minheap_reorder (k, simi, idxi);
964  }
965 
966 }
967 
968 void knn_L2sqr_by_idx (const float * x,
969  const float * y,
970  const long * __restrict ids,
971  size_t d, size_t nx, size_t ny,
972  float_maxheap_array_t * res)
973 {
974  size_t k = res->k;
975 
976 #pragma omp parallel for
977  for (size_t i = 0; i < nx; i++) {
978  const float * x_ = x + i * d;
979  const long * __restrict idsi = ids + i * ny;
980  float * __restrict simi = res->get_val(i);
981  long * __restrict idxi = res->get_ids (i);
982  maxheap_heapify (res->k, simi, idxi);
983  for (size_t j = 0; j < ny; j++) {
984  float disij = fvec_L2sqr (x_, y + d * idsi[j], d);
985 
986  if (disij < simi[0]) {
987  maxheap_pop (k, simi, idxi);
988  maxheap_push (k, simi, idxi, disij, idsi[j]);
989  }
990  }
991  maxheap_reorder (res->k, simi, idxi);
992  }
993 
994 }
995 
996 
997 
998 
999 
1000 /***************************************************************************
1001  * Range search
1002  ***************************************************************************/
1003 
1004 /** Find the nearest neighbors for nx queries in a set of ny vectors
1005  * compute_l2 = compute pairwise squared L2 distance rather than inner prod
1006  */
1007 template <bool compute_l2>
1008 static void range_search_blas (
1009  const float * x,
1010  const float * y,
1011  size_t d, size_t nx, size_t ny,
1012  float radius,
1013  RangeSearchResult *result)
1014 {
1015 
1016  // BLAS does not like empty matrices
1017  if (nx == 0 || ny == 0) return;
1018 
1019  /* block sizes */
1020  const size_t bs_x = 4096, bs_y = 1024;
1021  // const size_t bs_x = 16, bs_y = 16;
1022  float *ip_block = new float[bs_x * bs_y];
1023 
1024  float *x_norms = nullptr, *y_norms = nullptr;
1025 
1026  if (compute_l2) {
1027  x_norms = new float[nx];
1028  fvec_norms_L2sqr (x_norms, x, d, nx);
1029  y_norms = new float[ny];
1030  fvec_norms_L2sqr (y_norms, y, d, ny);
1031  }
1032 
1033  std::vector <RangeSearchPartialResult *> partial_results;
1034 
1035  for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
1036  size_t j1 = j0 + bs_y;
1037  if (j1 > ny) j1 = ny;
1038  RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
1039  partial_results.push_back (pres);
1040 
1041  for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
1042  size_t i1 = i0 + bs_x;
1043  if(i1 > nx) i1 = nx;
1044 
1045  /* compute the actual dot products */
1046  {
1047  float one = 1, zero = 0;
1048  FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1049  sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
1050  y + j0 * d, &di,
1051  x + i0 * d, &di, &zero,
1052  ip_block, &nyi);
1053  }
1054 
1055 
1056  for (size_t i = i0; i < i1; i++) {
1057  const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1058 
1059  RangeSearchPartialResult::QueryResult & qres =
1060  pres->new_result (i);
1061 
1062  for (size_t j = j0; j < j1; j++) {
1063  float ip = *ip_line++;
1064  if (compute_l2) {
1065  float dis = x_norms[i] + y_norms[j] - 2 * ip;
1066  if (dis < radius) {
1067  qres.add (dis, j);
1068  }
1069  } else {
1070  if (ip > radius) {
1071  qres.add (ip, j);
1072  }
1073  }
1074  }
1075  }
1076  }
1077 
1078  }
1079  delete [] ip_block;
1080  delete [] x_norms;
1081  delete [] y_norms;
1082 
1083  { // merge the partial results
1084  int npres = partial_results.size();
1085  // count
1086  for (size_t i = 0; i < nx; i++) {
1087  for (int j = 0; j < npres; j++)
1088  result->lims[i] += partial_results[j]->queries[i].nres;
1089  }
1090  result->do_allocation ();
1091  for (int j = 0; j < npres; j++) {
1092  partial_results[j]->set_result (true);
1093  delete partial_results[j];
1094  }
1095 
1096  // reset the limits
1097  for (size_t i = nx; i > 0; i--) {
1098  result->lims [i] = result->lims [i - 1];
1099  }
1100  result->lims [0] = 0;
1101  }
1102 }
1103 
1104 
1105 template <bool compute_l2>
1106 static void range_search_sse (const float * x,
1107  const float * y,
1108  size_t d, size_t nx, size_t ny,
1109  float radius,
1110  RangeSearchResult *res)
1111 {
1112  FAISS_ASSERT (d % 4 == 0);
1113 
1114 #pragma omp parallel
1115  {
1116  RangeSearchPartialResult pres (res);
1117 
1118 #pragma omp for
1119  for (size_t i = 0; i < nx; i++) {
1120  const float * x_ = x + i * d;
1121  const float * y_ = y;
1122  size_t j;
1123 
1124  RangeSearchPartialResult::QueryResult & qres =
1125  pres.new_result (i);
1126 
1127  for (j = 0; j < ny; j++) {
1128  if (compute_l2) {
1129  float disij = fvec_L2sqr (x_, y_, d);
1130  if (disij < radius) {
1131  qres.add (disij, j);
1132  }
1133  } else {
1134  float ip = fvec_inner_product (x_, y_, d);
1135  if (ip > radius) {
1136  qres.add (ip, j);
1137  }
1138  }
1139  y_ += d;
1140  }
1141 
1142  }
1143  pres.finalize ();
1144  }
1145 }
1146 
1147 
1148 
1149 
1150 
1152  const float * x,
1153  const float * y,
1154  size_t d, size_t nx, size_t ny,
1155  float radius,
1156  RangeSearchResult *res)
1157 {
1158 
1159  if (d % 4 == 0 && nx < 20) {
1160  range_search_sse<true> (x, y, d, nx, ny, radius, res);
1161  } else {
1162  range_search_blas<true> (x, y, d, nx, ny, radius, res);
1163  }
1164 }
1165 
1167  const float * x,
1168  const float * y,
1169  size_t d, size_t nx, size_t ny,
1170  float radius,
1171  RangeSearchResult *res)
1172 {
1173 
1174  if (d % 4 == 0 && nx < 20) {
1175  range_search_sse<false> (x, y, d, nx, ny, radius, res);
1176  } else {
1177  range_search_blas<false> (x, y, d, nx, ny, radius, res);
1178  }
1179 }
1180 
1181 
1182 
1183 /***************************************************************************
1184  * Some matrix manipulation functions
1185  ***************************************************************************/
1186 
1187 
1188 /* This function exists because the Torch counterpart is extremly slow
1189  (not multi-threaded + unexpected overhead even in single thread).
1190  It is here to implement the usual property |x-y|^2=|x|^2+|y|^2-2<x|y> */
1191 void inner_product_to_L2sqr (float * __restrict dis,
1192  const float * nr1,
1193  const float * nr2,
1194  size_t n1, size_t n2)
1195 {
1196 
1197 #pragma omp parallel for
1198  for (size_t j = 0 ; j < n1 ; j++) {
1199  float * disj = dis + j * n2;
1200  for (size_t i = 0 ; i < n2 ; i++)
1201  disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1202  }
1203 }
1204 
1205 
1206 void matrix_qr (int m, int n, float *a)
1207 {
1208  FAISS_ASSERT(m >= n);
1209  FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1210  std::vector<float> tau (ki);
1211  FINTEGER lwork = -1, info;
1212  float work_size;
1213 
1214  sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1215  &work_size, &lwork, &info);
1216  lwork = size_t(work_size);
1217  std::vector<float> work (lwork);
1218 
1219  sgeqrf_ (&mi, &ni, a, &mi,
1220  tau.data(), work.data(), &lwork, &info);
1221 
1222  sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1223  work.data(), &lwork, &info);
1224 
1225 }
1226 
1227 
1228 void pairwise_L2sqr (long d,
1229  long nq, const float *xq,
1230  long nb, const float *xb,
1231  float *dis,
1232  long ldq, long ldb, long ldd)
1233 {
1234  if (nq == 0 || nb == 0) return;
1235  if (ldq == -1) ldq = d;
1236  if (ldb == -1) ldb = d;
1237  if (ldd == -1) ldd = nb;
1238 
1239  // store in beginning of distance matrix to avoid malloc
1240  float *b_norms = dis;
1241 
1242 #pragma omp parallel for
1243  for (long i = 0; i < nb; i++)
1244  b_norms [i] = fvec_norm_L2sqr (xb + i * ldb, d);
1245 
1246 #pragma omp parallel for
1247  for (long i = 1; i < nq; i++) {
1248  float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
1249  for (long j = 0; j < nb; j++)
1250  dis[i * ldd + j] = q_norm + b_norms [j];
1251  }
1252 
1253  {
1254  float q_norm = fvec_norm_L2sqr (xq, d);
1255  for (long j = 0; j < nb; j++)
1256  dis[j] += q_norm;
1257  }
1258 
1259  {
1260  FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1261  float one = 1.0, minus_2 = -2.0;
1262 
1263  sgemm_ ("Transposed", "Not transposed",
1264  &nbi, &nqi, &di,
1265  &minus_2,
1266  xb, &ldbi,
1267  xq, &ldqi,
1268  &one, dis, &lddi);
1269  }
1270 
1271 
1272 }
1273 
1274 
1275 
1276 /***************************************************************************
1277  * Kmeans subroutine
1278  ***************************************************************************/
1279 
1280 // a bit above machine epsilon for float16
1281 
1282 #define EPS (1 / 1024.)
1283 
1284 /* For k-means, compute centroids given assignment of vectors to centroids */
1285 /* NOTE: This could be multi-threaded (use histogram of indexes) */
1286 int km_update_centroids (const float * x,
1287  float * centroids,
1288  long * assign,
1289  size_t d, size_t k, size_t n)
1290 {
1291  std::vector<size_t> hassign(k);
1292  memset (centroids, 0, sizeof(*centroids) * d * k);
1293 
1294 
1295 #pragma omp parallel
1296  {
1297  int nt = omp_get_num_threads();
1298  int rank = omp_get_thread_num();
1299  // this thread is taking care of centroids c0:c1
1300  size_t c0 = (k * rank) / nt;
1301  size_t c1 = (k * (rank + 1)) / nt;
1302  const float *xi = x;
1303  // printf("thread %d/%d: centroids %ld:%ld\n", rank, nt, c0, c1);
1304  size_t nacc = 0;
1305 
1306  for (size_t i = 0; i < n; i++) {
1307  long ci = assign[i];
1308  assert (ci >= 0 && ci < k);
1309  if (ci >= c0 && ci < c1) {
1310  float * c = centroids + ci * d;
1311  hassign[ci]++;
1312  for (size_t j = 0; j < d; j++)
1313  c[j] += xi[j];
1314  nacc++;
1315  }
1316  xi += d;
1317  }
1318  // printf("thread %d/%d: nacc = %ld/%ld\n", rank, nt, nacc, n);
1319 
1320  }
1321 
1322 #pragma omp parallel for
1323  for (size_t ci = 0; ci < k; ci++) {
1324  float * c = centroids + ci * d;
1325  float ni = (float) hassign[ci];
1326  if (ni != 0) {
1327  for (size_t j = 0; j < d; j++)
1328  c[j] /= ni;
1329  }
1330  }
1331 
1332  /* Take care of void clusters */
1333  size_t nsplit = 0;
1334  RandomGenerator rng (1234);
1335  for (size_t ci = 0; ci < k; ci++) {
1336  if (hassign[ci] == 0) { /* need to redefine a centroid */
1337  size_t cj;
1338  for (cj = 0; 1; cj = (cj+1) % k) {
1339  /* probability to pick this cluster for split */
1340  float p = (hassign[cj] - 1.0) / (float) (n - k);
1341  float r = rng.rand_float ();
1342  if (r < p) {
1343  break; /* found our cluster to be split */
1344  }
1345  }
1346  memcpy (centroids+ci*d, centroids+cj*d, sizeof(*centroids) * d);
1347 
1348  /* small symmetric pertubation. Much better than */
1349  for (size_t j = 0; j < d; j++)
1350  if (j % 2 == 0) {
1351  centroids[ci * d + j] *= 1 + EPS;
1352  centroids[cj * d + j] *= 1 - EPS;
1353  }
1354  else {
1355  centroids[ci * d + j] *= 1 + EPS;
1356  centroids[cj * d + j] *= 1 - EPS;
1357  }
1358 
1359  /* assume even split of the cluster */
1360  hassign[ci] = hassign[cj] / 2;
1361  hassign[cj] -= hassign[ci];
1362  nsplit++;
1363  }
1364  }
1365 
1366  return nsplit;
1367 }
1368 
1369 #undef EPS
1370 
1371 
1372 
1373 /***************************************************************************
1374  * Result list routines
1375  ***************************************************************************/
1376 
1377 
1378 void ranklist_handle_ties (int k, long *idx, const float *dis)
1379 {
1380  float prev_dis = -1e38;
1381  int prev_i = -1;
1382  for (int i = 0; i < k; i++) {
1383  if (dis[i] != prev_dis) {
1384  if (i > prev_i + 1) {
1385  // sort between prev_i and i - 1
1386  std::sort (idx + prev_i, idx + i);
1387  }
1388  prev_i = i;
1389  prev_dis = dis[i];
1390  }
1391  }
1392 }
1393 
1394 size_t ranklist_intersection_size (size_t k1, const long *v1,
1395  size_t k2, const long *v2_in)
1396 {
1397  if (k2 > k1) return ranklist_intersection_size (k2, v2_in, k1, v1);
1398  long *v2 = new long [k2];
1399  memcpy (v2, v2_in, sizeof (long) * k2);
1400  std::sort (v2, v2 + k2);
1401  { // de-dup v2
1402  long prev = -1;
1403  size_t wp = 0;
1404  for (size_t i = 0; i < k2; i++) {
1405  if (v2 [i] != prev) {
1406  v2[wp++] = prev = v2 [i];
1407  }
1408  }
1409  k2 = wp;
1410  }
1411  const long seen_flag = 1L << 60;
1412  size_t count = 0;
1413  for (size_t i = 0; i < k1; i++) {
1414  long q = v1 [i];
1415  size_t i0 = 0, i1 = k2;
1416  while (i0 + 1 < i1) {
1417  size_t imed = (i1 + i0) / 2;
1418  long piv = v2 [imed] & ~seen_flag;
1419  if (piv <= q) i0 = imed;
1420  else i1 = imed;
1421  }
1422  if (v2 [i0] == q) {
1423  count++;
1424  v2 [i0] |= seen_flag;
1425  }
1426  }
1427  delete [] v2;
1428 
1429  return count;
1430 }
1431 
1432 double imbalance_factor (int k, const int *hist) {
1433  double tot = 0, uf = 0;
1434 
1435  for (int i = 0 ; i < k ; i++) {
1436  tot += hist[i];
1437  uf += hist[i] * (double) hist[i];
1438  }
1439  uf = uf * k / (tot * tot);
1440 
1441  return uf;
1442 }
1443 
1444 
1445 double imbalance_factor (int n, int k, const long *assign) {
1446  std::vector<int> hist(k, 0);
1447  for (int i = 0; i < n; i++) {
1448  hist[assign[i]]++;
1449  }
1450 
1451  return imbalance_factor (k, hist.data());
1452 }
1453 
1454 
1455 
1456 int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
1457  memset (hist, 0, sizeof(hist[0]) * vmax);
1458  int nout = 0;
1459  while (n--) {
1460  if (v[n] < 0 || v[n] >= vmax) nout++;
1461  else hist[v[n]]++;
1462  }
1463  return nout;
1464 }
1465 
1466 
1467 void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
1468 {
1469  FAISS_ASSERT (nbits % 8 == 0);
1470  size_t d = nbits / 8;
1471  std::vector<int> accu(d * 256);
1472  const uint8_t *c = codes;
1473  for (size_t i = 0; i < n; i++)
1474  for(int j = 0; j < d; j++)
1475  accu[j * 256 + *c++]++;
1476  memset (hist, 0, sizeof(*hist) * nbits);
1477  for (int i = 0; i < d; i++) {
1478  const int *ai = accu.data() + i * 256;
1479  int * hi = hist + i * 8;
1480  for (int j = 0; j < 256; j++)
1481  for (int k = 0; k < 8; k++)
1482  if ((j >> k) & 1)
1483  hi[k] += ai[j];
1484  }
1485 
1486 }
1487 
1488 
1489 
1490 size_t ivec_checksum (size_t n, const int *a)
1491 {
1492  size_t cs = 112909;
1493  while (n--) cs = cs * 65713 + a[n] * 1686049;
1494  return cs;
1495 }
1496 
1497 
1498 namespace {
1499  struct ArgsortComparator {
1500  const float *vals;
1501  bool operator() (const size_t a, const size_t b) const {
1502  return vals[a] < vals[b];
1503  }
1504  };
1505 
1506  struct SegmentS {
1507  size_t i0; // begin pointer in the permutation array
1508  size_t i1; // end
1509  size_t len() const {
1510  return i1 - i0;
1511  }
1512  };
1513 
1514  // see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge
1515  // extended to > 1 merge thread
1516 
1517  // merges 2 ranges that should be consecutive on the source into
1518  // the union of the two on the destination
1519  template<typename T>
1520  void parallel_merge (const T *src, T *dst,
1521  SegmentS &s1, SegmentS & s2, int nt,
1522  const ArgsortComparator & comp) {
1523  if (s2.len() > s1.len()) { // make sure that s1 larger than s2
1524  std::swap(s1, s2);
1525  }
1526 
1527  // compute sub-ranges for each thread
1528  SegmentS s1s[nt], s2s[nt], sws[nt];
1529  s2s[0].i0 = s2.i0;
1530  s2s[nt - 1].i1 = s2.i1;
1531 
1532  // not sure parallel actually helps here
1533 #pragma omp parallel for num_threads(nt)
1534  for (int t = 0; t < nt; t++) {
1535  s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1536  s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1537 
1538  if (t + 1 < nt) {
1539  T pivot = src[s1s[t].i1];
1540  size_t i0 = s2.i0, i1 = s2.i1;
1541  while (i0 + 1 < i1) {
1542  size_t imed = (i1 + i0) / 2;
1543  if (comp (pivot, src[imed])) {i1 = imed; }
1544  else {i0 = imed; }
1545  }
1546  s2s[t].i1 = s2s[t + 1].i0 = i1;
1547  }
1548  }
1549  s1.i0 = std::min(s1.i0, s2.i0);
1550  s1.i1 = std::max(s1.i1, s2.i1);
1551  s2 = s1;
1552  sws[0].i0 = s1.i0;
1553  for (int t = 0; t < nt; t++) {
1554  sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1555  if (t + 1 < nt) {
1556  sws[t + 1].i0 = sws[t].i1;
1557  }
1558  }
1559  assert(sws[nt - 1].i1 == s1.i1);
1560 
1561  // do the actual merging
1562 #pragma omp parallel for num_threads(nt)
1563  for (int t = 0; t < nt; t++) {
1564  SegmentS sw = sws[t];
1565  SegmentS s1t = s1s[t];
1566  SegmentS s2t = s2s[t];
1567  if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1568  for (;;) {
1569  // assert (sw.len() == s1t.len() + s2t.len());
1570  if (comp(src[s1t.i0], src[s2t.i0])) {
1571  dst[sw.i0++] = src[s1t.i0++];
1572  if (s1t.i0 == s1t.i1) break;
1573  } else {
1574  dst[sw.i0++] = src[s2t.i0++];
1575  if (s2t.i0 == s2t.i1) break;
1576  }
1577  }
1578  }
1579  if (s1t.len() > 0) {
1580  assert(s1t.len() == sw.len());
1581  memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
1582  } else if (s2t.len() > 0) {
1583  assert(s2t.len() == sw.len());
1584  memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
1585  }
1586  }
1587  }
1588 
1589 };
1590 
1591 void fvec_argsort (size_t n, const float *vals,
1592  size_t *perm)
1593 {
1594  for (size_t i = 0; i < n; i++) perm[i] = i;
1595  ArgsortComparator comp = {vals};
1596  std::sort (perm, perm + n, comp);
1597 }
1598 
1599 void fvec_argsort_parallel (size_t n, const float *vals,
1600  size_t *perm)
1601 {
1602  size_t * perm2 = new size_t[n];
1603  // 2 result tables, during merging, flip between them
1604  size_t *permB = perm2, *permA = perm;
1605 
1606  int nt = omp_get_max_threads();
1607  { // prepare correct permutation so that the result ends in perm
1608  // at final iteration
1609  int nseg = nt;
1610  while (nseg > 1) {
1611  nseg = (nseg + 1) / 2;
1612  std::swap (permA, permB);
1613  }
1614  }
1615 
1616 #pragma omp parallel
1617  for (size_t i = 0; i < n; i++) permA[i] = i;
1618 
1619  ArgsortComparator comp = {vals};
1620 
1621  SegmentS segs[nt];
1622 
1623  // independent sorts
1624 #pragma omp parallel for
1625  for (int t = 0; t < nt; t++) {
1626  size_t i0 = t * n / nt;
1627  size_t i1 = (t + 1) * n / nt;
1628  SegmentS seg = {i0, i1};
1629  std::sort (permA + seg.i0, permA + seg.i1, comp);
1630  segs[t] = seg;
1631  }
1632  int prev_nested = omp_get_nested();
1633  omp_set_nested(1);
1634 
1635  int nseg = nt;
1636  while (nseg > 1) {
1637  int nseg1 = (nseg + 1) / 2;
1638  int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1639  int sub_nseg1 = nseg / 2;
1640 
1641 #pragma omp parallel for num_threads(nseg1)
1642  for (int s = 0; s < nseg; s += 2) {
1643  if (s + 1 == nseg) { // otherwise isolated segment
1644  memcpy(permB + segs[s].i0, permA + segs[s].i0,
1645  segs[s].len() * sizeof(size_t));
1646  } else {
1647  int t0 = s * sub_nt / sub_nseg1;
1648  int t1 = (s + 1) * sub_nt / sub_nseg1;
1649  printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1650  parallel_merge(permA, permB, segs[s], segs[s + 1],
1651  t1 - t0, comp);
1652  }
1653  }
1654  for (int s = 0; s < nseg; s += 2)
1655  segs[s / 2] = segs[s];
1656  nseg = nseg1;
1657  std::swap (permA, permB);
1658  }
1659  assert (permA == perm);
1660  omp_set_nested(prev_nested);
1661  delete [] perm2;
1662 }
1663 
1664 
1665 
1666 
1667 
1668 
1669 
1670 
1671 
1672 
1673 
1674 
1675 
1676 
1677 
1678 
1679 /***************************************************************************
1680  * heavily optimized table computations
1681  ***************************************************************************/
1682 
1683 
1684 static inline void fvec_madd_ref (size_t n, const float *a,
1685  float bf, const float *b, float *c) {
1686  for (size_t i = 0; i < n; i++)
1687  c[i] = a[i] + bf * b[i];
1688 }
1689 
1690 
1691 static inline void fvec_madd_sse (size_t n, const float *a,
1692  float bf, const float *b, float *c) {
1693  n >>= 2;
1694  __m128 bf4 = _mm_set_ps1 (bf);
1695  __m128 * a4 = (__m128*)a;
1696  __m128 * b4 = (__m128*)b;
1697  __m128 * c4 = (__m128*)c;
1698 
1699  while (n--) {
1700  *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1701  b4++;
1702  a4++;
1703  c4++;
1704  }
1705 }
1706 
1707 void fvec_madd (size_t n, const float *a,
1708  float bf, const float *b, float *c)
1709 {
1710  if ((n & 3) == 0 &&
1711  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1712  fvec_madd_sse (n, a, bf, b, c);
1713  else
1714  fvec_madd_ref (n, a, bf, b, c);
1715 }
1716 
1717 static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
1718  float bf, const float *b, float *c) {
1719  float vmin = 1e20;
1720  int imin = -1;
1721 
1722  for (size_t i = 0; i < n; i++) {
1723  c[i] = a[i] + bf * b[i];
1724  if (c[i] < vmin) {
1725  vmin = c[i];
1726  imin = i;
1727  }
1728  }
1729  return imin;
1730 }
1731 
1732 static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
1733  float bf, const float *b, float *c) {
1734  n >>= 2;
1735  __m128 bf4 = _mm_set_ps1 (bf);
1736  __m128 vmin4 = _mm_set_ps1 (1e20);
1737  __m128i imin4 = _mm_set1_epi32 (-1);
1738  __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1739  __m128i inc4 = _mm_set1_epi32 (4);
1740  __m128 * a4 = (__m128*)a;
1741  __m128 * b4 = (__m128*)b;
1742  __m128 * c4 = (__m128*)c;
1743 
1744  while (n--) {
1745  __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1746  *c4 = vc4;
1747  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1748  // imin4 = _mm_blendv_epi8 (imin4, idx4, mask); // slower!
1749 
1750  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1751  _mm_andnot_si128 (mask, imin4));
1752  vmin4 = _mm_min_ps (vmin4, vc4);
1753  b4++;
1754  a4++;
1755  c4++;
1756  idx4 = _mm_add_epi32 (idx4, inc4);
1757  }
1758 
1759  // 4 values -> 2
1760  {
1761  idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1762  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1763  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1764  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1765  _mm_andnot_si128 (mask, imin4));
1766  vmin4 = _mm_min_ps (vmin4, vc4);
1767  }
1768  // 2 values -> 1
1769  {
1770  idx4 = _mm_shuffle_epi32 (imin4, 1);
1771  __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1772  __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1773  imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1774  _mm_andnot_si128 (mask, imin4));
1775  // vmin4 = _mm_min_ps (vmin4, vc4);
1776  }
1777  return _mm_extract_epi32 (imin4, 0);
1778 }
1779 
1780 
1781 int fvec_madd_and_argmin (size_t n, const float *a,
1782  float bf, const float *b, float *c)
1783 {
1784  if ((n & 3) == 0 &&
1785  ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
1786  return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1787  else
1788  return fvec_madd_and_argmin_ref (n, a, bf, b, c);
1789 }
1790 
1791 
1792 
1794  size_t d, size_t *n, size_t nmax, const float *x,
1795  bool verbose, long seed)
1796 {
1797 
1798  if (*n <= nmax) return x; // nothing to do
1799 
1800  size_t n2 = nmax;
1801  if (verbose) {
1802  printf (" Input training set too big (max size is %ld), sampling "
1803  "%ld / %ld vectors\n", nmax, n2, *n);
1804  }
1805  std::vector<int> subset (*n);
1806  rand_perm (subset.data (), *n, seed);
1807  float *x_subset = new float[n2 * d];
1808  for (long i = 0; i < n2; i++)
1809  memcpy (&x_subset[i * d],
1810  &x[subset[i] * size_t(d)],
1811  sizeof (x[0]) * d);
1812  *n = n2;
1813  return x_subset;
1814 }
1815 
1816 
1817 } // namespace faiss
random generator that can be used in multithreaded contexts
Definition: utils.h:49
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
Definition: utils.cpp:872
RandomGenerator(long seed=1234)
initialize
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n)
Definition: utils.cpp:1286
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
Definition: utils.cpp:432
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
Definition: utils.cpp:1467
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
Definition: utils.cpp:1793
void ranklist_handle_ties(int k, long *idx, const float *dis)
Definition: utils.cpp:1378
float rand_float()
between 0 and 1
Definition: utils.cpp:208
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1707
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
Definition: utils.cpp:1456
long rand_long()
random long &lt; 2 ^ 62
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
Definition: utils.cpp:1394
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
Definition: utils.cpp:1228
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
Definition: utils.cpp:1166
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
Definition: utils.cpp:832
double getmillisecs()
ms elapsed since some arbitrary epoch
Definition: utils.cpp:71
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
Definition: utils.cpp:1432
float fvec_norm_L2sqr(const float *x, size_t d)
Definition: utils.cpp:513
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
Definition: utils.cpp:1151
void matrix_qr(int m, int n, float *a)
Definition: utils.cpp:1206
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
Definition: utils.cpp:1490
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
Definition: utils.cpp:1781
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
Definition: utils.cpp:852