18 #include <sys/types.h>
23 #include <immintrin.h>
28 #include "AuxIndexStructures.h"
29 #include "FaissAssert.h"
42 int sgemm_ (
const char *transa,
const char *transb, FINTEGER *m, FINTEGER *
43 n, FINTEGER *k,
const float *alpha,
const float *a,
44 FINTEGER *lda,
const float *b, FINTEGER *
45 ldb,
float *beta,
float *c, FINTEGER *ldc);
49 int sgeqrf_ (FINTEGER *m, FINTEGER *n,
float *a, FINTEGER *lda,
50 float *tau,
float *work, FINTEGER *lwork, FINTEGER *info);
52 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k,
float *a,
53 FINTEGER *lda,
float *tau,
float *work,
54 FINTEGER *lwork, FINTEGER *info);
56 int sgemv_(
const char *trans, FINTEGER *m, FINTEGER *n,
float *alpha,
57 const float *a, FINTEGER *lda,
const float *x, FINTEGER *incx,
58 float *beta,
float *y, FINTEGER *incy);
71 gettimeofday (&tv,
nullptr);
72 return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
82 snprintf (fname, 256,
"/proc/%d/status", pid);
83 FILE * f = fopen (fname,
"r");
84 FAISS_THROW_IF_NOT_MSG (f,
"cannot open proc status file");
88 if (!fgets (buf, 256, f))
break;
89 if (sscanf (buf,
"VmRSS: %ld kB", &sz) == 1)
break;
99 fprintf(stderr,
"WARN: get_mem_usage_kb not implemented on the mac\n");
111 RandomGenerator::RandomGenerator (
long seed)
112 : mt((unsigned int)seed) {}
114 int RandomGenerator::rand_int ()
116 return mt() & 0x7fffffff;
119 long RandomGenerator::rand_long ()
121 return long(rand_int()) | long(rand_int()) << 31;
124 int RandomGenerator::rand_int (
int max)
129 float RandomGenerator::rand_float ()
131 return mt() / float(mt.max());
134 double RandomGenerator::rand_double ()
136 return mt() / double(mt.max());
148 void float_rand (
float * x,
size_t n,
long seed)
151 const size_t nblock = n < 1024 ? 1 : 1024;
153 RandomGenerator rng0 (seed);
154 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
156 #pragma omp parallel for
157 for (
size_t j = 0; j < nblock; j++) {
159 RandomGenerator rng (a0 + j * b0);
161 const size_t istart = j * n / nblock;
162 const size_t iend = (j + 1) * n / nblock;
164 for (
size_t i = istart; i < iend; i++)
165 x[i] = rng.rand_float ();
170 void float_randn (
float * x,
size_t n,
long seed)
173 const size_t nblock = n < 1024 ? 1 : 1024;
175 RandomGenerator rng0 (seed);
176 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
178 #pragma omp parallel for
179 for (
size_t j = 0; j < nblock; j++) {
180 RandomGenerator rng (a0 + j * b0);
182 double a = 0, b = 0, s = 0;
185 const size_t istart = j * n / nblock;
186 const size_t iend = (j + 1) * n / nblock;
188 for (
size_t i = istart; i < iend; i++) {
192 a = 2.0 * rng.rand_double () - 1;
193 b = 2.0 * rng.rand_double () - 1;
196 x[i] = a * sqrt(-2.0 * log(s) / s);
199 x[i] = b * sqrt(-2.0 * log(s) / s);
207 void long_rand (
long * x,
size_t n,
long seed)
210 const size_t nblock = n < 1024 ? 1 : 1024;
212 RandomGenerator rng0 (seed);
213 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
215 #pragma omp parallel for
216 for (
size_t j = 0; j < nblock; j++) {
218 RandomGenerator rng (a0 + j * b0);
220 const size_t istart = j * n / nblock;
221 const size_t iend = (j + 1) * n / nblock;
222 for (
size_t i = istart; i < iend; i++)
223 x[i] = rng.rand_long ();
229 void rand_perm (
int *perm,
size_t n,
long seed)
231 for (
size_t i = 0; i < n; i++) perm[i] = i;
233 RandomGenerator rng (seed);
235 for (
size_t i = 0; i + 1 < n; i++) {
236 int i2 = i + rng.rand_int (n - i);
237 std::swap(perm[i], perm[i2]);
244 void byte_rand (uint8_t * x,
size_t n,
long seed)
247 const size_t nblock = n < 1024 ? 1 : 1024;
249 RandomGenerator rng0 (seed);
250 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
252 #pragma omp parallel for
253 for (
size_t j = 0; j < nblock; j++) {
255 RandomGenerator rng (a0 + j * b0);
257 const size_t istart = j * n / nblock;
258 const size_t iend = (j + 1) * n / nblock;
261 for (i = istart; i < iend; i++)
262 x[i] = rng.rand_long ();
268 void reflection (
const float * __restrict u,
269 float * __restrict x,
270 size_t n,
size_t d,
size_t nu)
273 for (i = 0; i < n; i++) {
274 const float * up = u;
275 for (l = 0; l < nu; l++) {
276 float ip1 = 0, ip2 = 0;
278 for (j = 0; j < d; j+=2) {
280 ip2 += up[j+1] * x[j+1];
282 float ip = 2 * (ip1 + ip2);
284 for (j = 0; j < d; j++)
294 void reflection_ref (
const float * u,
float * x,
size_t n,
size_t d,
size_t nu)
297 for (i = 0; i < n; i++) {
298 const float * up = u;
299 for (l = 0; l < nu; l++) {
302 for (j = 0; j < d; j++)
306 for (j = 0; j < d; j++)
329 void fvec_inner_products_ny (
float * ip,
339 float one = 1.0, zero = 0.0;
341 sgemv_ (
"T", &di, &nyi, &one, y, &di, x, &onei, &zero, ip, &onei);
344 for (
size_t i = 0; i < ny; i++) {
345 ip[i] = fvec_inner_product (x, y, d);
355 void fvec_norms_L2 (
float * __restrict nr,
356 const float * __restrict x,
360 #pragma omp parallel for
361 for (
size_t i = 0; i < nx; i++) {
362 nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
366 void fvec_norms_L2sqr (
float * __restrict nr,
367 const float * __restrict x,
370 #pragma omp parallel for
371 for (
size_t i = 0; i < nx; i++)
372 nr[i] = fvec_norm_L2sqr (x + i * d, d);
377 void fvec_renorm_L2 (
size_t d,
size_t nx,
float * __restrict x)
379 #pragma omp parallel for
380 for (
size_t i = 0; i < nx; i++) {
381 float * __restrict xi = x + i * d;
387 const float inv_nr = 1.0 / sqrtf (nr);
388 for (j = 0; j < d; j++)
417 static void knn_inner_product_sse (
const float * x,
419 size_t d,
size_t nx,
size_t ny,
420 float_minheap_array_t * res)
423 size_t check_period = InterruptCallback::get_period_hint (ny * d);
425 for (
size_t i0 = 0; i0 < nx; i0 += check_period) {
426 size_t i1 = std::min(i0 + check_period, nx);
428 #pragma omp parallel for
429 for (
size_t i = i0; i < i1; i++) {
430 const float * x_i = x + i * d;
431 const float * y_j = y;
433 float * __restrict simi = res->get_val(i);
434 long * __restrict idxi = res->get_ids (i);
436 minheap_heapify (k, simi, idxi);
438 for (
size_t j = 0; j < ny; j++) {
439 float ip = fvec_inner_product (x_i, y_j, d);
442 minheap_pop (k, simi, idxi);
443 minheap_push (k, simi, idxi, ip, j);
447 minheap_reorder (k, simi, idxi);
449 InterruptCallback::check ();
454 static void knn_L2sqr_sse (
457 size_t d,
size_t nx,
size_t ny,
458 float_maxheap_array_t * res)
462 size_t check_period = InterruptCallback::get_period_hint (ny * d);
464 for (
size_t i0 = 0; i0 < nx; i0 += check_period) {
465 size_t i1 = std::min(i0 + check_period, nx);
467 #pragma omp parallel for
468 for (
size_t i = i0; i < i1; i++) {
469 const float * x_i = x + i * d;
470 const float * y_j = y;
472 float * simi = res->get_val(i);
473 long * idxi = res->get_ids (i);
475 maxheap_heapify (k, simi, idxi);
476 for (j = 0; j < ny; j++) {
479 if (disij < simi[0]) {
480 maxheap_pop (k, simi, idxi);
481 maxheap_push (k, simi, idxi, disij, j);
485 maxheap_reorder (k, simi, idxi);
487 InterruptCallback::check ();
494 static void knn_inner_product_blas (
497 size_t d,
size_t nx,
size_t ny,
498 float_minheap_array_t * res)
503 if (nx == 0 || ny == 0)
return;
506 const size_t bs_x = 4096, bs_y = 1024;
508 std::unique_ptr<float[]> ip_block(
new float[bs_x * bs_y]);
510 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
511 size_t i1 = i0 + bs_x;
514 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
515 size_t j1 = j0 + bs_y;
516 if (j1 > ny) j1 = ny;
519 float one = 1, zero = 0;
520 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
521 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
523 x + i0 * d, &di, &zero,
524 ip_block.get(), &nyi);
528 res->addn (j1 - j0, ip_block.get(), j0, i0, i1 - i0);
530 InterruptCallback::check ();
537 template<
class DistanceCorrection>
538 static void knn_L2sqr_blas (
const float * x,
540 size_t d,
size_t nx,
size_t ny,
541 float_maxheap_array_t * res,
542 const DistanceCorrection &corr)
547 if (nx == 0 || ny == 0)
return;
552 const size_t bs_x = 4096, bs_y = 1024;
554 float *ip_block =
new float[bs_x * bs_y];
555 float *x_norms =
new float[nx];
556 float *y_norms =
new float[ny];
557 ScopeDeleter<float> del1(ip_block), del3(x_norms), del2(y_norms);
559 fvec_norms_L2sqr (x_norms, x, d, nx);
560 fvec_norms_L2sqr (y_norms, y, d, ny);
563 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
564 size_t i1 = i0 + bs_x;
567 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
568 size_t j1 = j0 + bs_y;
569 if (j1 > ny) j1 = ny;
572 float one = 1, zero = 0;
573 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
574 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
576 x + i0 * d, &di, &zero,
581 #pragma omp parallel for
582 for (
size_t i = i0; i < i1; i++) {
583 float * __restrict simi = res->get_val(i);
584 long * __restrict idxi = res->get_ids (i);
585 const float *ip_line = ip_block + (i - i0) * (j1 - j0);
587 for (
size_t j = j0; j < j1; j++) {
588 float ip = *ip_line++;
589 float dis = x_norms[i] + y_norms[j] - 2 * ip;
593 if (dis < 0) dis = 0;
595 dis = corr (dis, i, j);
598 maxheap_pop (k, simi, idxi);
599 maxheap_push (k, simi, idxi, dis, j);
604 InterruptCallback::check ();
622 int distance_compute_blas_threshold = 20;
626 size_t d,
size_t nx,
size_t ny,
629 if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
630 knn_inner_product_sse (x, y, d, nx, ny, res);
632 knn_inner_product_blas (x, y, d, nx, ny, res);
639 float operator()(
float dis,
size_t ,
size_t )
const {
646 size_t d,
size_t nx,
size_t ny,
649 if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
650 knn_L2sqr_sse (x, y, d, nx, ny, res);
653 knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
658 const float *base_shift;
659 float operator()(
float dis,
size_t ,
size_t bno)
const {
660 return dis - base_shift[bno];
667 size_t d,
size_t nx,
size_t ny,
669 const float *base_shift)
672 knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
683 void fvec_inner_products_by_idx (
float * __restrict ip,
686 const long * __restrict ids,
687 size_t d,
size_t nx,
size_t ny)
689 #pragma omp parallel for
690 for (
size_t j = 0; j < nx; j++) {
691 const long * __restrict idsj = ids + j * ny;
692 const float * xj = x + j * d;
693 float * __restrict ipj = ip + j * ny;
694 for (
size_t i = 0; i < ny; i++) {
697 ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
704 void fvec_L2sqr_by_idx (
float * __restrict dis,
707 const long * __restrict ids,
708 size_t d,
size_t nx,
size_t ny)
710 #pragma omp parallel for
711 for (
size_t j = 0; j < nx; j++) {
712 const long * __restrict idsj = ids + j * ny;
713 const float * xj = x + j * d;
714 float * __restrict disj = dis + j * ny;
715 for (
size_t i = 0; i < ny; i++) {
718 disj[i] =
fvec_L2sqr (xj, y + d * idsj[i], d);
729 void knn_inner_products_by_idx (
const float * x,
732 size_t d,
size_t nx,
size_t ny,
733 float_minheap_array_t * res)
737 #pragma omp parallel for
738 for (
size_t i = 0; i < nx; i++) {
739 const float * x_ = x + i * d;
740 const long * idsi = ids + i * ny;
742 float * __restrict simi = res->get_val(i);
743 long * __restrict idxi = res->get_ids (i);
744 minheap_heapify (k, simi, idxi);
746 for (j = 0; j < ny; j++) {
747 if (idsi[j] < 0)
break;
748 float ip = fvec_inner_product (x_, y + d * idsi[j], d);
751 minheap_pop (k, simi, idxi);
752 minheap_push (k, simi, idxi, ip, idsi[j]);
755 minheap_reorder (k, simi, idxi);
760 void knn_L2sqr_by_idx (
const float * x,
762 const long * __restrict ids,
763 size_t d,
size_t nx,
size_t ny,
764 float_maxheap_array_t * res)
768 #pragma omp parallel for
769 for (
size_t i = 0; i < nx; i++) {
770 const float * x_ = x + i * d;
771 const long * __restrict idsi = ids + i * ny;
772 float * __restrict simi = res->get_val(i);
773 long * __restrict idxi = res->get_ids (i);
774 maxheap_heapify (res->k, simi, idxi);
775 for (
size_t j = 0; j < ny; j++) {
776 float disij =
fvec_L2sqr (x_, y + d * idsi[j], d);
778 if (disij < simi[0]) {
779 maxheap_pop (k, simi, idxi);
780 maxheap_push (k, simi, idxi, disij, idsi[j]);
783 maxheap_reorder (res->k, simi, idxi);
799 template <
bool compute_l2>
800 static void range_search_blas (
803 size_t d,
size_t nx,
size_t ny,
805 RangeSearchResult *result)
809 if (nx == 0 || ny == 0)
return;
812 const size_t bs_x = 4096, bs_y = 1024;
814 float *ip_block =
new float[bs_x * bs_y];
815 ScopeDeleter<float> del0(ip_block);
817 float *x_norms =
nullptr, *y_norms =
nullptr;
818 ScopeDeleter<float> del1, del2;
820 x_norms =
new float[nx];
822 fvec_norms_L2sqr (x_norms, x, d, nx);
824 y_norms =
new float[ny];
826 fvec_norms_L2sqr (y_norms, y, d, ny);
829 std::vector <RangeSearchPartialResult *> partial_results;
831 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
832 size_t j1 = j0 + bs_y;
833 if (j1 > ny) j1 = ny;
834 RangeSearchPartialResult * pres =
new RangeSearchPartialResult (result);
835 partial_results.push_back (pres);
837 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
838 size_t i1 = i0 + bs_x;
843 float one = 1, zero = 0;
844 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
845 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
847 x + i0 * d, &di, &zero,
852 for (
size_t i = i0; i < i1; i++) {
853 const float *ip_line = ip_block + (i - i0) * (j1 - j0);
855 RangeQueryResult & qres = pres->new_result (i);
857 for (
size_t j = j0; j < j1; j++) {
858 float ip = *ip_line++;
860 float dis = x_norms[i] + y_norms[j] - 2 * ip;
872 InterruptCallback::check ();
875 RangeSearchPartialResult::merge (partial_results);
879 template <
bool compute_l2>
880 static void range_search_sse (
const float * x,
882 size_t d,
size_t nx,
size_t ny,
884 RangeSearchResult *res)
886 FAISS_THROW_IF_NOT (d % 4 == 0);
888 size_t check_period = InterruptCallback::get_period_hint (ny * d);
889 bool interrupted =
false;
893 RangeSearchPartialResult pres (res);
895 for (
size_t i0 = 0; i0 < nx; i0 += check_period) {
896 size_t i1 = std::min(i0 + check_period, nx);
899 for (
size_t i = i0; i < i1; i++) {
900 const float * x_ = x + i * d;
901 const float * y_ = y;
904 RangeQueryResult & qres = pres.new_result (i);
906 for (j = 0; j < ny; j++) {
909 if (disij < radius) {
913 float ip = fvec_inner_product (x_, y_, d);
923 if (InterruptCallback::is_interrupted ()) {
936 FAISS_THROW_MSG (
"computation interrupted");
947 size_t d,
size_t nx,
size_t ny,
952 if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
953 range_search_sse<true> (x, y, d, nx, ny, radius, res);
955 range_search_blas<true> (x, y, d, nx, ny, radius, res);
962 size_t d,
size_t nx,
size_t ny,
967 if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
968 range_search_sse<false> (x, y, d, nx, ny, radius, res);
970 range_search_blas<false> (x, y, d, nx, ny, radius, res);
/** Rewrites an n1 x n2 row-major table in place:
 *  dis[j * n2 + i] becomes nr1[j] + nr2[i] - 2 * dis[j * n2 + i].
 *
 *  @param dis  table of size n1 * n2, read and overwritten
 *  @param nr1  one value per row (size n1)
 *  @param nr2  one value per column (size n2)
 *  @param n1   number of rows
 *  @param n2   number of columns
 */
void inner_product_to_L2sqr (float * __restrict dis,
                             const float *nr1,
                             const float *nr2,
                             size_t n1, size_t n2)
{
#pragma omp parallel for
    for (size_t row = 0 ; row < n1 ; row++) {
        float * cell = dis + row * n2;
        for (size_t col = 0 ; col < n2 ; col++, cell++) {
            *cell = nr1[row] + nr2[col] - 2 * *cell;
        }
    }
}
1001 FAISS_THROW_IF_NOT (m >= n);
1002 FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1003 std::vector<float> tau (ki);
1004 FINTEGER lwork = -1, info;
1007 sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1008 &work_size, &lwork, &info);
1009 lwork = size_t(work_size);
1010 std::vector<float> work (lwork);
1012 sgeqrf_ (&mi, &ni, a, &mi,
1013 tau.data(), work.data(), &lwork, &info);
1015 sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1016 work.data(), &lwork, &info);
1022 long nq,
const float *xq,
1023 long nb,
const float *xb,
1025 long ldq,
long ldb,
long ldd)
1027 if (nq == 0 || nb == 0)
return;
1028 if (ldq == -1) ldq = d;
1029 if (ldb == -1) ldb = d;
1030 if (ldd == -1) ldd = nb;
1033 float *b_norms = dis;
1035 #pragma omp parallel for
1036 for (
long i = 0; i < nb; i++)
1039 #pragma omp parallel for
1040 for (
long i = 1; i < nq; i++) {
1042 for (
long j = 0; j < nb; j++)
1043 dis[i * ldd + j] = q_norm + b_norms [j];
1048 for (
long j = 0; j < nb; j++)
1053 FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1054 float one = 1.0, minus_2 = -2.0;
1056 sgemm_ (
"Transposed",
"Not transposed",
1075 #define EPS (1 / 1024.)
1081 size_t d,
size_t k,
size_t n,
1085 centroids += k_frozen * d;
1087 std::vector<size_t> hassign(k);
1088 memset (centroids, 0,
sizeof(*centroids) * d * k);
1090 #pragma omp parallel
1092 int nt = omp_get_num_threads();
1093 int rank = omp_get_thread_num();
1095 size_t c0 = (k * rank) / nt;
1096 size_t c1 = (k * (rank + 1)) / nt;
1097 const float *xi = x;
1100 for (
size_t i = 0; i < n; i++) {
1101 long ci = assign[i];
1102 assert (ci >= 0 && ci < k + k_frozen);
1104 if (ci >= c0 && ci < c1) {
1105 float * c = centroids + ci * d;
1107 for (
size_t j = 0; j < d; j++)
1116 #pragma omp parallel for
1117 for (
size_t ci = 0; ci < k; ci++) {
1118 float * c = centroids + ci * d;
1119 float ni = (float) hassign[ci];
1121 for (
size_t j = 0; j < d; j++)
1129 for (
size_t ci = 0; ci < k; ci++) {
1130 if (hassign[ci] == 0) {
1132 for (cj = 0; 1; cj = (cj + 1) % k) {
1134 float p = (hassign[cj] - 1.0) / (float) (n - k);
1140 memcpy (centroids+ci*d, centroids+cj*d,
sizeof(*centroids) * d);
1143 for (
size_t j = 0; j < d; j++) {
1145 centroids[ci * d + j] *= 1 + EPS;
1146 centroids[cj * d + j] *= 1 - EPS;
1148 centroids[ci * d + j] *= 1 - EPS;
1149 centroids[cj * d + j] *= 1 + EPS;
1154 hassign[ci] = hassign[cj] / 2;
1155 hassign[cj] -= hassign[ci];
1174 float prev_dis = -1e38;
1176 for (
int i = 0; i < k; i++) {
1177 if (dis[i] != prev_dis) {
1178 if (i > prev_i + 1) {
1180 std::sort (idx + prev_i, idx + i);
1189 long *I0,
float *D0,
1190 const long *I1,
const float *D1,
1196 #pragma omp parallel reduction(+:n1)
1198 std::vector<long> tmpI (k);
1199 std::vector<float> tmpD (k);
1202 for (
size_t i = 0; i < n; i++) {
1203 long *lI0 = I0 + i * k;
1204 float *lD0 = D0 + i * k;
1205 const long *lI1 = I1 + i * k;
1206 const float *lD1 = D1 + i * k;
1211 for (
size_t j = 0; j < k; j++) {
1213 if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
1217 }
else if (lD1[r1] >= 0) {
1219 tmpI[j] = lI1[r1] + translation;
1227 for (
size_t j = 0; j < k; j++) {
1228 if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
1232 }
else if (lD1[r1] >= 0) {
1234 tmpI[j] = lI1[r1] + translation;
1243 memcpy (lD0, tmpD.data(),
sizeof (lD0[0]) * k);
1244 memcpy (lI0, tmpI.data(),
sizeof (lI0[0]) * k);
1254 size_t k2,
const long *v2_in)
1257 long *v2 =
new long [k2];
1258 memcpy (v2, v2_in,
sizeof (
long) * k2);
1259 std::sort (v2, v2 + k2);
1263 for (
size_t i = 0; i < k2; i++) {
1264 if (v2 [i] != prev) {
1265 v2[wp++] = prev = v2 [i];
1270 const long seen_flag = 1L << 60;
1272 for (
size_t i = 0; i < k1; i++) {
1274 size_t i0 = 0, i1 = k2;
1275 while (i0 + 1 < i1) {
1276 size_t imed = (i1 + i0) / 2;
1277 long piv = v2 [imed] & ~seen_flag;
1278 if (piv <= q) i0 = imed;
1283 v2 [i0] |= seen_flag;
1292 double tot = 0, uf = 0;
1294 for (
int i = 0 ; i < k ; i++) {
1296 uf += hist[i] * (double) hist[i];
1298 uf = uf * k / (tot * tot);
1305 std::vector<int> hist(k, 0);
1306 for (
int i = 0; i < n; i++) {
/** Computes a histogram of the n values in v.
 *
 *  hist[0..vmax) is zeroed first, then every value inside [0, vmax)
 *  increments its bin; values outside that range are only counted.
 *
 *  @param n     number of values in v
 *  @param v     input values
 *  @param vmax  number of bins
 *  @param hist  output bins, size vmax
 *  @return      number of values that fell outside [0, vmax)
 */
int ivec_hist (size_t n, const int * v, int vmax, int *hist) {
    memset (hist, 0, sizeof(hist[0]) * vmax);

    int rejected = 0;
    for (size_t pos = 0; pos < n; pos++) {
        const int value = v[pos];
        if (value >= 0 && value < vmax) {
            hist[value]++;
        } else {
            rejected++;
        }
    }
    return rejected;
}
1328 FAISS_THROW_IF_NOT (nbits % 8 == 0);
1329 size_t d = nbits / 8;
1330 std::vector<int> accu(d * 256);
1331 const uint8_t *c = codes;
1332 for (
size_t i = 0; i < n; i++)
1333 for(
int j = 0; j < d; j++)
1334 accu[j * 256 + *c++]++;
1335 memset (hist, 0,
sizeof(*hist) * nbits);
1336 for (
int i = 0; i < d; i++) {
1337 const int *ai = accu.data() + i * 256;
1338 int * hi = hist + i * 8;
1339 for (
int j = 0; j < 256; j++)
1340 for (
int k = 0; k < 8; k++)
1352 while (n--) cs = cs * 65713 + a[n] * 1686049;
1358 struct ArgsortComparator {
1360 bool operator() (
const size_t a,
const size_t b)
const {
1361 return vals[a] < vals[b];
1368 size_t len()
const {
1378 template<
typename T>
1379 void parallel_merge (
const T *src, T *dst,
1380 SegmentS &s1, SegmentS & s2,
int nt,
1381 const ArgsortComparator & comp) {
1382 if (s2.len() > s1.len()) {
1387 SegmentS s1s[nt], s2s[nt], sws[nt];
1389 s2s[nt - 1].i1 = s2.i1;
1392 #pragma omp parallel for num_threads(nt)
1393 for (
int t = 0; t < nt; t++) {
1394 s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1395 s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1398 T pivot = src[s1s[t].i1];
1399 size_t i0 = s2.i0, i1 = s2.i1;
1400 while (i0 + 1 < i1) {
1401 size_t imed = (i1 + i0) / 2;
1402 if (comp (pivot, src[imed])) {i1 = imed; }
1405 s2s[t].i1 = s2s[t + 1].i0 = i1;
1408 s1.i0 = std::min(s1.i0, s2.i0);
1409 s1.i1 = std::max(s1.i1, s2.i1);
1412 for (
int t = 0; t < nt; t++) {
1413 sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1415 sws[t + 1].i0 = sws[t].i1;
1418 assert(sws[nt - 1].i1 == s1.i1);
1421 #pragma omp parallel for num_threads(nt)
1422 for (
int t = 0; t < nt; t++) {
1423 SegmentS sw = sws[t];
1424 SegmentS s1t = s1s[t];
1425 SegmentS s2t = s2s[t];
1426 if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1429 if (comp(src[s1t.i0], src[s2t.i0])) {
1430 dst[sw.i0++] = src[s1t.i0++];
1431 if (s1t.i0 == s1t.i1)
break;
1433 dst[sw.i0++] = src[s2t.i0++];
1434 if (s2t.i0 == s2t.i1)
break;
1438 if (s1t.len() > 0) {
1439 assert(s1t.len() == sw.len());
1440 memcpy(dst + sw.i0, src + s1t.i0, s1t.len() *
sizeof(dst[0]));
1441 }
else if (s2t.len() > 0) {
1442 assert(s2t.len() == sw.len());
1443 memcpy(dst + sw.i0, src + s2t.i0, s2t.len() *
sizeof(dst[0]));
/** Indirect sort: fills perm with 0..n-1 ordered so that
 *  vals[perm[0]], vals[perm[1]], ... is non-decreasing.
 *
 *  @param n     number of values
 *  @param vals  values to order (not modified)
 *  @param perm  output permutation, size n
 */
void fvec_argsort (size_t n, const float *vals,
                   size_t *perm)
{
    for (size_t pos = 0; pos < n; pos++) {
        perm[pos] = pos;
    }

    // order indices by the value they refer to
    std::sort (perm, perm + n,
               [vals] (const size_t a, const size_t b) {
                   return vals[a] < vals[b];
               });
}
// Parallel argsort of vals[0..n): an index buffer is filled with 0..n-1,
// nt contiguous slices of it are sorted independently, and neighbouring
// segments are then merged pairwise with parallel_merge, alternating
// between two index buffers (permA / permB).
// NOTE(review): several lines of this function are missing from this
// excerpt; the comments below describe only the statements that are visible.
1458 void fvec_argsort_parallel (
size_t n,
const float *vals,
// Scratch buffer of n indices, used as the second of the two buffers.
1461 size_t * perm2 =
new size_t[n];
1463 size_t *permB = perm2, *permA = perm;
// One slice per available OpenMP thread.
1465 int nt = omp_get_max_threads();
// presumably a dry run of the merge passes that decides which buffer to
// start from -- TODO confirm, the enclosing loop is not visible here.
1470 nseg = (nseg + 1) / 2;
1471 std::swap (permA, permB);
// NOTE(review): this is `omp parallel`, not `omp parallel for`. If the
// loop below is the region's body, every thread of the team runs the
// whole loop and writes the same values -- confirm whether
// `parallel for` was intended.
1475 #pragma omp parallel
1476 for (
size_t i = 0; i < n; i++) permA[i] = i;
1478 ArgsortComparator comp = {vals};
// Sort slice t, i.e. indices [t * n / nt, (t + 1) * n / nt), by value.
1483 #pragma omp parallel for
1484 for (
int t = 0; t < nt; t++) {
1485 size_t i0 = t * n / nt;
1486 size_t i1 = (t + 1) * n / nt;
1487 SegmentS seg = {i0, i1};
1488 std::sort (permA + seg.i0, permA + seg.i1, comp);
// Remember the nested-parallelism setting; it is restored at the end.
1491 int prev_nested = omp_get_nested();
// nseg1 = number of segment pairs in this pass (an odd segment counts
// as a pair of its own).
1496 int nseg1 = (nseg + 1) / 2;
1497 int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1498 int sub_nseg1 = nseg / 2;
1500 #pragma omp parallel for num_threads(nseg1)
1501 for (
int s = 0; s < nseg; s += 2) {
// A trailing segment without a partner is copied to the other buffer as is.
1502 if (s + 1 == nseg) {
1503 memcpy(permB + segs[s].i0, permA + segs[s].i0,
1504 segs[s].len() *
sizeof(
size_t));
1506 int t0 = s * sub_nt / sub_nseg1;
1507 int t1 = (s + 1) * sub_nt / sub_nseg1;
// NOTE(review): unconditional printf to stdout on every merge -- looks
// like leftover debug output; confirm whether it should be removed or
// gated on a verbosity flag.
1508 printf(
"merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1509 parallel_merge(permA, permB, segs[s], segs[s + 1],
// Compact the segment table: the even-indexed entries move to the front.
1513 for (
int s = 0; s < nseg; s += 2)
1514 segs[s / 2] = segs[s];
// The merged data now lives in permB, so the buffers trade roles.
1516 std::swap (permA, permB);
// The result is expected to end up in the caller's buffer.
1518 assert (permA == perm);
1519 omp_set_nested(prev_nested);
1541 size_t d,
size_t *n,
size_t nmax,
const float *x,
1542 bool verbose,
long seed)
1545 if (*n <= nmax)
return x;
1549 printf (
" Input training set too big (max size is %ld), sampling "
1550 "%ld / %ld vectors\n", nmax, n2, *n);
1552 std::vector<int> subset (*n);
1553 rand_perm (subset.data (), *n, seed);
1554 float *x_subset =
new float[n2 * d];
1555 for (
long i = 0; i < n2; i++)
1556 memcpy (&x_subset[i * d],
1557 &x[subset[i] *
size_t(d)],
1565 for (
size_t i = 0; i < d; ++i) {
1566 x_out[i] = 2 * ((x_in[i >> 3] >> (i & 7)) & 1) - 1;
1571 for (
size_t i = 0; i < d / 8; ++i) {
1573 for (
int j = 0; j < 8; ++j) {
1574 if (x_in[8 * i + j] > 0) {
// The hash state starts as the first byte shifted left by 7 bits.
// NOTE(review): *p is read here before any length test, so an empty
// input would still read bytes[0] -- confirm callers never pass a
// zero-length buffer (the declaration of `len` is not visible in this excerpt).
1585 const uint8_t *p = bytes;
1586 uint64_t x = (uint64_t)(*p) << 7;
// One byte per iteration: multiply the state by 1000003, then XOR in the byte.
1588 while (--len >= 0) {
1589 x = (1000003*x) ^ *p++;
1597 omp_set_num_threads(10);
1599 if (omp_get_max_threads() != 10) {
1603 std::vector<int> nt_per_thread(10);
1605 bool in_parallel =
true;
1606 #pragma omp parallel reduction(+: sum)
1608 if (!omp_in_parallel()) {
1609 in_parallel =
false;
1612 int nt = omp_get_num_threads();
1613 int rank = omp_get_thread_num();
1615 nt_per_thread[rank] = nt;
1617 for(
int i = 0; i < 1000 * 1000 * 10; i++) {
1625 if (nt_per_thread[0] != 10) {
random generator that can be used in multithreaded contexts
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n, size_t k_frozen)
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
const float * fvecs_maybe_subsample(size_t d, size_t *n, size_t nmax, const float *x, bool verbose, long seed)
void ranklist_handle_ties(int k, long *idx, const float *dis)
float rand_float()
between 0 and 1
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
uint64_t hash_bytes(const uint8_t *bytes, long n)
size_t merge_result_table_with(size_t n, size_t k, long *I0, float *D0, const long *I1, const float *D1, bool keep_min, long translation)
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
double getmillisecs()
ms elapsed since some arbitrary epoch
void real_to_binary(size_t d, const float *x_in, uint8_t *x_out)
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
float fvec_norm_L2sqr(const float *x, size_t d)
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
void matrix_qr(int m, int n, float *a)
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)
void binary_to_real(size_t d, const uint8_t *x_in, float *x_out)