21 #include <smmintrin.h>
25 #include <sys/types.h>
34 #include "AuxIndexStructures.h"
35 #include "FaissAssert.h"
/* Forward declaration of the Fortran BLAS routine SGEMM
   (single-precision general matrix-matrix multiply):
   C = alpha * op(A) * op(B) + beta * C.
   Fortran calling convention: every scalar argument is passed
   by pointer, and matrices are column-major. */
48 int sgemm_ (
const char *transa,
const char *transb, FINTEGER *m, FINTEGER *
49 n, FINTEGER *k,
const float *alpha,
const float *a,
50 FINTEGER *lda,
const float *b, FINTEGER *
51 ldb,
float *beta,
float *c, FINTEGER *ldc);
/* Forward declaration of the Fortran LAPACK routine SGEQRF:
   computes the QR factorization of an m-by-n single-precision
   matrix a (in place), returning the Householder scalars in tau.
   Call once with lwork = -1 to query the optimal workspace size. */
55 int sgeqrf_ (FINTEGER *m, FINTEGER *n,
float *a, FINTEGER *lda,
56 float *tau,
float *work, FINTEGER *lwork, FINTEGER *info);
/* Forward declaration of the Fortran LAPACK routine SORGQR:
   overwrites a (which holds the output of SGEQRF) with the
   explicit orthogonal matrix Q of the QR factorization, using
   the k Householder reflectors described by tau. */
58 int sorgqr_(FINTEGER *m, FINTEGER *n, FINTEGER *k,
float *a,
59 FINTEGER *lda,
float *tau,
float *work,
60 FINTEGER *lwork, FINTEGER *info);
74 gettimeofday (&tv,
nullptr);
75 return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
85 snprintf (fname, 256,
"/proc/%d/status", pid);
86 FILE * f = fopen (fname,
"r");
87 FAISS_ASSERT (f || !
"cannot open proc status file") ;
91 if (!fgets (buf, 256, f))
break;
92 if (sscanf (buf,
"VmRSS: %ld kB", &sz) == 1)
break;
102 fprintf(stderr,
"WARN: get_mem_usage_kb not implemented on the mac\n");
140 random_r (&rand_data, &a);
147 random_r (&rand_data, &a);
148 random_r (&rand_data, &b);
149 return long(a) | long(b) << 31;
155 memset (&rand_data, 0,
sizeof (rand_data));
156 initstate_r (seed, rand_state,
sizeof (rand_state), &rand_data);
162 memcpy (rand_state, other.rand_state,
sizeof(rand_state));
163 rand_data = other.rand_data;
164 setstate_r (rand_state, &rand_data);
171 int rand_r(
unsigned *seed);
182 rand_state = other.rand_state;
190 int lowbits = rand_r(&rand_state) >> 15;
191 return rand_r(&rand_state) ^ lowbits;
196 return long(random()) | long(random()) << 31;
214 double RandomGenerator::rand_double ()
228 void float_rand (
float * x,
size_t n,
long seed)
231 const size_t nblock = n < 1024 ? 1 : 1024;
233 RandomGenerator rng0 (seed);
234 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
236 #pragma omp parallel for
237 for (
size_t j = 0; j < nblock; j++) {
239 RandomGenerator rng (a0 + j * b0);
241 const size_t istart = j * n / nblock;
242 const size_t iend = (j + 1) * n / nblock;
244 for (
size_t i = istart; i < iend; i++)
245 x[i] = rng.rand_float ();
250 void float_randn (
float * x,
size_t n,
long seed)
253 const size_t nblock = n < 1024 ? 1 : 1024;
255 RandomGenerator rng0 (seed);
256 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
258 #pragma omp parallel for
259 for (
size_t j = 0; j < nblock; j++) {
260 RandomGenerator rng (a0 + j * b0);
262 double a = 0, b = 0, s = 0;
265 const size_t istart = j * n / nblock;
266 const size_t iend = (j + 1) * n / nblock;
268 for (
size_t i = istart; i < iend; i++) {
272 a = 2.0 * rng.rand_double () - 1;
273 b = 2.0 * rng.rand_double () - 1;
276 x[i] = a * sqrt(-2.0 * log(s) / s);
279 x[i] = b * sqrt(-2.0 * log(s) / s);
287 void long_rand (
long * x,
size_t n,
long seed)
290 const size_t nblock = n < 1024 ? 1 : 1024;
292 RandomGenerator rng0 (seed);
293 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
295 #pragma omp parallel for
296 for (
size_t j = 0; j < nblock; j++) {
298 RandomGenerator rng (a0 + j * b0);
300 const size_t istart = j * n / nblock;
301 const size_t iend = (j + 1) * n / nblock;
302 for (
size_t i = istart; i < iend; i++)
303 x[i] = rng.rand_long ();
309 void rand_perm (
int *perm,
size_t n,
long seed)
311 for (
size_t i = 0; i < n; i++) perm[i] = i;
313 RandomGenerator rng (seed);
315 for (
size_t i = 0; i + 1 < n; i++) {
316 int i2 = i + rng.rand_int (n - i);
317 std::swap(perm[i], perm[i2]);
324 void byte_rand (uint8_t * x,
size_t n,
long seed)
327 const size_t nblock = n < 1024 ? 1 : 1024;
329 RandomGenerator rng0 (seed);
330 int a0 = rng0.rand_int (), b0 = rng0.rand_int ();
332 #pragma omp parallel for
333 for (
size_t j = 0; j < nblock; j++) {
335 RandomGenerator rng (a0 + j * b0);
337 const size_t istart = j * n / nblock;
338 const size_t iend = (j + 1) * n / nblock;
341 for (i = istart; i < iend; i++)
342 x[i] = rng.rand_long ();
348 void reflection (
const float * __restrict u,
349 float * __restrict x,
350 size_t n,
size_t d,
size_t nu)
353 for (i = 0; i < n; i++) {
354 const float * up = u;
355 for (l = 0; l < nu; l++) {
356 float ip1 = 0, ip2 = 0;
358 for (j = 0; j < d; j+=2) {
360 ip2 += up[j+1] * x[j+1];
362 float ip = 2 * (ip1 + ip2);
364 for (j = 0; j < d; j++)
374 void reflection_ref (
const float * u,
float * x,
size_t n,
size_t d,
size_t nu)
377 for (i = 0; i < n; i++) {
378 const float * up = u;
379 for (l = 0; l < nu; l++) {
382 for (j = 0; j < d; j++)
386 for (j = 0; j < d; j++)
419 static inline __m128 maskout (
int d, __m128 x)
422 __m128i d4 = _mm_set1_epi32 (d);
423 __m128i lim = _mm_set_epi32 (3, 2, 1, 0);
424 __m128i mask = _mm_cmpgt_epi32 (d4, lim);
426 return _mm_and_ps (x, (__m128)mask);
436 __m128 msum1 = _mm_setzero_ps();
439 mx = _mm_loadu_ps (x); x += 4;
440 my = _mm_loadu_ps (y); y += 4;
441 const __m128 a_m_b1 = _mm_sub_ps (mx, my);
442 msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
447 mx = _mm_loadu_ps (x);
448 my = _mm_loadu_ps (y);
449 __m128 a_m_b1 = maskout (d, _mm_sub_ps (mx, my));
451 msum1 = _mm_add_ps (msum1, _mm_mul_ps (a_m_b1, a_m_b1));
453 msum1 = _mm_hadd_ps (msum1, msum1);
454 msum1 = _mm_hadd_ps (msum1, msum1);
455 return _mm_cvtss_f32 (msum1);
460 float fvec_L2sqr_ref (
const float * x,
466 for (i = 0; i < d; i++) {
467 const float tmp = x[i] - y[i];
473 float fvec_inner_product (
const float * x,
478 __m128 msum1 = _mm_setzero_ps();
481 mx = _mm_loadu_ps (x); x += 4;
482 my = _mm_loadu_ps (y); y += 4;
483 msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
488 mx = _mm_loadu_ps (x);
489 my = _mm_loadu_ps (y);
490 __m128 prod = maskout (d, _mm_mul_ps (mx, my));
492 msum1 = _mm_add_ps (msum1, prod);
494 msum1 = _mm_hadd_ps (msum1, msum1);
495 msum1 = _mm_hadd_ps (msum1, msum1);
496 return _mm_cvtss_f32 (msum1);
500 float fvec_inner_product_ref (
const float * x,
506 for (i = 0; i < d; i++)
516 __m128 msum1 = _mm_setzero_ps();
519 mx = _mm_loadu_ps (x); x += 4;
520 msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
524 mx = maskout (d, _mm_loadu_ps (x));
525 msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
527 msum1 = _mm_hadd_ps (msum1, msum1);
528 msum1 = _mm_hadd_ps (msum1, msum1);
529 return _mm_cvtss_f32 (msum1);
533 float fvec_norm_L2sqr_ref (
const float * __restrict x,
538 for (i = 0; i < d; i++)
550 void fvec_inner_products_ny (
float * __restrict ip,
555 for (
size_t i = 0; i < ny; i++) {
556 ip[i] = fvec_inner_product (x, y, d);
565 void fvec_L2sqr_ny (
float * __restrict dis,
570 for (
size_t i = 0; i < ny; i++) {
580 void fvec_norms_L2 (
float * __restrict nr,
581 const float * __restrict x,
585 #pragma omp parallel for
586 for (
size_t i = 0; i < nx; i++) {
591 void fvec_norms_L2sqr (
float * __restrict nr,
592 const float * __restrict x,
595 #pragma omp parallel for
596 for (
size_t i = 0; i < nx; i++)
602 void fvec_renorm_L2 (
size_t d,
size_t nx,
float * __restrict x)
604 #pragma omp parallel for
605 for (
size_t i = 0; i < nx; i++) {
606 float * __restrict xi = x + i * d;
612 const float inv_nr = 1.0 / sqrtf (nr);
613 for (j = 0; j < d; j++)
642 static void knn_inner_product_sse (
const float * x,
644 size_t d,
size_t nx,
size_t ny,
645 float_minheap_array_t * res)
649 #pragma omp parallel for
650 for (
size_t i = 0; i < nx; i++) {
651 const float * x_ = x + i * d;
652 const float * y_ = y;
654 float * __restrict simi = res->get_val(i);
655 long * __restrict idxi = res->get_ids (i);
657 minheap_heapify (k, simi, idxi);
659 for (
size_t j = 0; j < ny; j++) {
660 float ip = fvec_inner_product (x_, y_, d);
663 minheap_pop (k, simi, idxi);
664 minheap_push (k, simi, idxi, ip, j);
668 minheap_reorder (k, simi, idxi);
673 static void knn_L2sqr_sse (
676 size_t d,
size_t nx,
size_t ny,
677 float_maxheap_array_t * res)
681 #pragma omp parallel for
682 for (
size_t i = 0; i < nx; i++) {
683 const float * x_ = x + i * d;
684 const float * y_ = y;
686 float * __restrict simi = res->get_val(i);
687 long * __restrict idxi = res->get_ids (i);
689 maxheap_heapify (k, simi, idxi);
690 for (j = 0; j < ny; j++) {
693 if (disij < simi[0]) {
694 maxheap_pop (k, simi, idxi);
695 maxheap_push (k, simi, idxi, disij, j);
699 maxheap_reorder (k, simi, idxi);
706 static void knn_inner_product_blas (
709 size_t d,
size_t nx,
size_t ny,
710 float_minheap_array_t * res)
715 if (nx == 0 || ny == 0)
return;
718 const size_t bs_x = 4096, bs_y = 1024;
720 float *ip_block =
new float[bs_x * bs_y];
722 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
723 size_t i1 = i0 + bs_x;
726 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
727 size_t j1 = j0 + bs_y;
728 if (j1 > ny) j1 = ny;
731 float one = 1, zero = 0;
732 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
733 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
735 x + i0 * d, &di, &zero,
740 res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
749 template<
class DistanceCorrection>
750 static void knn_L2sqr_blas (
const float * x,
752 size_t d,
size_t nx,
size_t ny,
753 float_maxheap_array_t * res,
754 const DistanceCorrection &corr)
759 if (nx == 0 || ny == 0)
return;
764 const size_t bs_x = 4096, bs_y = 1024;
766 float *ip_block =
new float[bs_x * bs_y];
768 float *x_norms =
new float[nx];
769 fvec_norms_L2sqr (x_norms, x, d, nx);
771 float *y_norms =
new float[ny];
772 fvec_norms_L2sqr (y_norms, y, d, ny);
774 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
775 size_t i1 = i0 + bs_x;
778 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
779 size_t j1 = j0 + bs_y;
780 if (j1 > ny) j1 = ny;
783 float one = 1, zero = 0;
784 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
785 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
787 x + i0 * d, &di, &zero,
792 #pragma omp parallel for
793 for (
size_t i = i0; i < i1; i++) {
794 float * __restrict simi = res->get_val(i);
795 long * __restrict idxi = res->get_ids (i);
796 const float *ip_line = ip_block + (i - i0) * (j1 - j0);
798 for (
size_t j = j0; j < j1; j++) {
799 float ip = *ip_line++;
800 float dis = x_norms[i] + y_norms[j] - 2 * ip;
802 dis = corr (dis, i, j);
805 maxheap_pop (k, simi, idxi);
806 maxheap_push (k, simi, idxi, dis, j);
833 size_t d,
size_t nx,
size_t ny,
836 if (d % 4 == 0 && nx < 20) {
837 knn_inner_product_sse (x, y, d, nx, ny, res);
839 knn_inner_product_blas (x, y, d, nx, ny, res);
846 float operator()(
float dis,
size_t qno,
size_t bno)
const {
853 size_t d,
size_t nx,
size_t ny,
856 if (d % 4 == 0 && nx < 20) {
857 knn_L2sqr_sse (x, y, d, nx, ny, res);
860 knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
865 const float *base_shift;
866 float operator()(
float dis,
size_t qno,
size_t bno)
const {
867 return dis - base_shift[bno];
874 size_t d,
size_t nx,
size_t ny,
876 const float *base_shift)
879 knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
890 void fvec_inner_products_by_idx (
float * __restrict ip,
893 const long * __restrict ids,
894 size_t d,
size_t nx,
size_t ny)
896 #pragma omp parallel for
897 for (
size_t j = 0; j < nx; j++) {
898 const long * __restrict idsj = ids + j * ny;
899 const float * xj = x + j * d;
900 float * __restrict ipj = ip + j * ny;
901 for (
size_t i = 0; i < ny; i++) {
904 ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
911 void fvec_L2sqr_by_idx (
float * __restrict dis,
914 const long * __restrict ids,
915 size_t d,
size_t nx,
size_t ny)
917 #pragma omp parallel for
918 for (
size_t j = 0; j < nx; j++) {
919 const long * __restrict idsj = ids + j * ny;
920 const float * xj = x + j * d;
921 float * __restrict disj = dis + j * ny;
922 for (
size_t i = 0; i < ny; i++) {
925 disj[i] =
fvec_L2sqr (xj, y + d * idsj[i], d);
936 void knn_inner_products_by_idx (
const float * x,
938 const long * __restrict ids,
939 size_t d,
size_t nx,
size_t ny,
940 float_minheap_array_t * res)
946 #pragma omp parallel for
947 for (
size_t i = 0; i < nx; i++) {
948 const float * x_ = x + i * d;
949 const long * idsi = ids + i * ny;
951 float * __restrict simi = res->get_val(i);
952 long * __restrict idxi = res->get_ids (i);
953 minheap_heapify (k, simi, idxi);
955 for (j = 0; j < ny; j++) {
956 if (idsi[j] < 0)
break;
957 float ip = fvec_inner_product (x_, y + d * idsi[j], d);
960 minheap_pop (k, simi, idxi);
961 minheap_push (k, simi, idxi, ip, idsi[j]);
964 minheap_reorder (k, simi, idxi);
969 void knn_L2sqr_by_idx (
const float * x,
971 const long * __restrict ids,
972 size_t d,
size_t nx,
size_t ny,
973 float_maxheap_array_t * res)
977 #pragma omp parallel for
978 for (
size_t i = 0; i < nx; i++) {
979 const float * x_ = x + i * d;
980 const long * __restrict idsi = ids + i * ny;
981 float * __restrict simi = res->get_val(i);
982 long * __restrict idxi = res->get_ids (i);
983 maxheap_heapify (res->k, simi, idxi);
984 for (
size_t j = 0; j < ny; j++) {
985 float disij =
fvec_L2sqr (x_, y + d * idsi[j], d);
987 if (disij < simi[0]) {
988 maxheap_pop (k, simi, idxi);
989 maxheap_push (k, simi, idxi, disij, idsi[j]);
992 maxheap_reorder (res->k, simi, idxi);
1008 template <
bool compute_l2>
1009 static void range_search_blas (
1012 size_t d,
size_t nx,
size_t ny,
1014 RangeSearchResult *result)
1018 if (nx == 0 || ny == 0)
return;
1021 const size_t bs_x = 4096, bs_y = 1024;
1023 float *ip_block =
new float[bs_x * bs_y];
1025 float *x_norms =
nullptr, *y_norms =
nullptr;
1028 x_norms =
new float[nx];
1029 fvec_norms_L2sqr (x_norms, x, d, nx);
1030 y_norms =
new float[ny];
1031 fvec_norms_L2sqr (y_norms, y, d, ny);
1034 std::vector <RangeSearchPartialResult *> partial_results;
1036 for (
size_t j0 = 0; j0 < ny; j0 += bs_y) {
1037 size_t j1 = j0 + bs_y;
1038 if (j1 > ny) j1 = ny;
1039 RangeSearchPartialResult * pres =
new RangeSearchPartialResult (result);
1040 partial_results.push_back (pres);
1042 for (
size_t i0 = 0; i0 < nx; i0 += bs_x) {
1043 size_t i1 = i0 + bs_x;
1044 if(i1 > nx) i1 = nx;
1048 float one = 1, zero = 0;
1049 FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
1050 sgemm_ (
"Transpose",
"Not transpose", &nyi, &nxi, &di, &one,
1052 x + i0 * d, &di, &zero,
1057 for (
size_t i = i0; i < i1; i++) {
1058 const float *ip_line = ip_block + (i - i0) * (j1 - j0);
1060 RangeSearchPartialResult::QueryResult & qres =
1061 pres->new_result (i);
1063 for (
size_t j = j0; j < j1; j++) {
1064 float ip = *ip_line++;
1066 float dis = x_norms[i] + y_norms[j] - 2 * ip;
1085 int npres = partial_results.size();
1087 for (
size_t i = 0; i < nx; i++) {
1088 for (
int j = 0; j < npres; j++)
1089 result->lims[i] += partial_results[j]->queries[i].nres;
1091 result->do_allocation ();
1092 for (
int j = 0; j < npres; j++) {
1093 partial_results[j]->set_result (
true);
1094 delete partial_results[j];
1098 for (
size_t i = nx; i > 0; i--) {
1099 result->lims [i] = result->lims [i - 1];
1101 result->lims [0] = 0;
1106 template <
bool compute_l2>
1107 static void range_search_sse (
const float * x,
1109 size_t d,
size_t nx,
size_t ny,
1111 RangeSearchResult *res)
1113 FAISS_ASSERT (d % 4 == 0);
1115 #pragma omp parallel
1117 RangeSearchPartialResult pres (res);
1120 for (
size_t i = 0; i < nx; i++) {
1121 const float * x_ = x + i * d;
1122 const float * y_ = y;
1125 RangeSearchPartialResult::QueryResult & qres =
1126 pres.new_result (i);
1128 for (j = 0; j < ny; j++) {
1131 if (disij < radius) {
1132 qres.add (disij, j);
1135 float ip = fvec_inner_product (x_, y_, d);
1155 size_t d,
size_t nx,
size_t ny,
1160 if (d % 4 == 0 && nx < 20) {
1161 range_search_sse<true> (x, y, d, nx, ny, radius, res);
1163 range_search_blas<true> (x, y, d, nx, ny, radius, res);
1170 size_t d,
size_t nx,
size_t ny,
1175 if (d % 4 == 0 && nx < 20) {
1176 range_search_sse<false> (x, y, d, nx, ny, radius, res);
1178 range_search_blas<false> (x, y, d, nx, ny, radius, res);
1192 void inner_product_to_L2sqr (
float * __restrict dis,
1195 size_t n1,
size_t n2)
1198 #pragma omp parallel for
1199 for (
size_t j = 0 ; j < n1 ; j++) {
1200 float * disj = dis + j * n2;
1201 for (
size_t i = 0 ; i < n2 ; i++)
1202 disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
1209 FAISS_ASSERT(m >= n);
1210 FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
1211 std::vector<float> tau (ki);
1212 FINTEGER lwork = -1, info;
1215 sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
1216 &work_size, &lwork, &info);
1217 lwork = size_t(work_size);
1218 std::vector<float> work (lwork);
1220 sgeqrf_ (&mi, &ni, a, &mi,
1221 tau.data(), work.data(), &lwork, &info);
1223 sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
1224 work.data(), &lwork, &info);
1230 long nq,
const float *xq,
1231 long nb,
const float *xb,
1233 long ldq,
long ldb,
long ldd)
1235 if (nq == 0 || nb == 0)
return;
1236 if (ldq == -1) ldq = d;
1237 if (ldb == -1) ldb = d;
1238 if (ldd == -1) ldd = nb;
1241 float *b_norms = dis;
1243 #pragma omp parallel for
1244 for (
long i = 0; i < nb; i++)
1247 #pragma omp parallel for
1248 for (
long i = 1; i < nq; i++) {
1250 for (
long j = 0; j < nb; j++)
1251 dis[i * ldd + j] = q_norm + b_norms [j];
1256 for (
long j = 0; j < nb; j++)
1261 FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
1262 float one = 1.0, minus_2 = -2.0;
1264 sgemm_ (
"Transposed",
"Not transposed",
1288 size_t d,
size_t k,
size_t n)
1290 std::vector<size_t> hassign(k);
1291 memset (centroids, 0,
sizeof(*centroids) * d * k);
1294 #pragma omp parallel
1296 int nt = omp_get_num_threads();
1297 int rank = omp_get_thread_num();
1299 size_t c0 = (k * rank) / nt;
1300 size_t c1 = (k * (rank + 1)) / nt;
1301 const float *xi = x;
1305 for (
size_t i = 0; i < n; i++) {
1306 long ci = assign[i];
1307 assert (ci >= 0 && ci < k);
1308 if (ci >= c0 && ci < c1) {
1309 float * c = centroids + ci * d;
1311 for (
size_t j = 0; j < d; j++)
1321 #pragma omp parallel for
1322 for (
size_t ci = 0; ci < k; ci++) {
1323 float * c = centroids + ci * d;
1324 float ni = (float) hassign[ci];
1326 for (
size_t j = 0; j < d; j++)
1334 for (
size_t ci = 0; ci < k; ci++) {
1335 if (hassign[ci] == 0) {
1337 for (cj = 0; 1; cj = (cj+1) % k) {
1339 float p = (hassign[cj] - 1.0) / (float) (n - k);
1345 memcpy (centroids+ci*d, centroids+cj*d,
sizeof(*centroids) * d);
1348 for (
size_t j = 0; j < d; j++)
1350 centroids[ci * d + j] += EPS;
1351 centroids[cj * d + j] -= EPS;
1354 centroids[ci * d + j] -= EPS;
1355 centroids[cj * d + j] += EPS;
1359 hassign[ci] = hassign[cj] / 2;
1360 hassign[cj] -= hassign[ci];
1379 float prev_dis = -1e38;
1381 for (
int i = 0; i < k; i++) {
1382 if (dis[i] != prev_dis) {
1383 if (i > prev_i + 1) {
1385 std::sort (idx + prev_i, idx + i);
1394 size_t k2,
const long *v2_in)
1397 long *v2 =
new long [k2];
1398 memcpy (v2, v2_in,
sizeof (
long) * k2);
1399 std::sort (v2, v2 + k2);
1403 for (
size_t i = 0; i < k2; i++) {
1404 if (v2 [i] != prev) {
1405 v2[wp++] = prev = v2 [i];
1410 const long seen_flag = 1L << 60;
1412 for (
size_t i = 0; i < k1; i++) {
1414 size_t i0 = 0, i1 = k2;
1415 while (i0 + 1 < i1) {
1416 size_t imed = (i1 + i0) / 2;
1417 long piv = v2 [imed] & ~seen_flag;
1418 if (piv <= q) i0 = imed;
1423 v2 [i0] |= seen_flag;
1432 double tot = 0, uf = 0;
1434 for (
int i = 0 ; i < k ; i++) {
1436 uf += hist[i] * (double) hist[i];
1438 uf = uf * k / (tot * tot);
1445 std::vector<int> hist(k, 0);
1446 for (
int i = 0; i < n; i++) {
1455 int ivec_hist (
size_t n,
const int * v,
int vmax,
int *hist) {
1456 memset (hist, 0,
sizeof(hist[0]) * vmax);
1459 if (v[n] < 0 || v[n] >= vmax) nout++;
1468 FAISS_ASSERT (nbits % 8 == 0);
1469 size_t d = nbits / 8;
1470 std::vector<int> accu(d * 256);
1471 const uint8_t *c = codes;
1472 for (
size_t i = 0; i < n; i++)
1473 for(
int j = 0; j < d; j++)
1474 accu[j * 256 + *c++]++;
1475 memset (hist, 0,
sizeof(*hist) * nbits);
1476 for (
int i = 0; i < d; i++) {
1477 const int *ai = accu.data() + i * 256;
1478 int * hi = hist + i * 8;
1479 for (
int j = 0; j < 256; j++)
1480 for (
int k = 0; k < 8; k++)
1492 while (n--) cs = cs * 65713 + a[n] * 1686049;
1498 struct ArgsortComparator {
1500 bool operator() (
const size_t a,
const size_t b)
const {
1501 return vals[a] < vals[b];
1508 size_t len()
const {
1518 template<
typename T>
1519 void parallel_merge (
const T *src, T *dst,
1520 SegmentS &s1, SegmentS & s2,
int nt,
1521 const ArgsortComparator & comp) {
1522 if (s2.len() > s1.len()) {
1527 SegmentS s1s[nt], s2s[nt], sws[nt];
1529 s2s[nt - 1].i1 = s2.i1;
1532 #pragma omp parallel for num_threads(nt)
1533 for (
int t = 0; t < nt; t++) {
1534 s1s[t].i0 = s1.i0 + s1.len() * t / nt;
1535 s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;
1538 T pivot = src[s1s[t].i1];
1539 size_t i0 = s2.i0, i1 = s2.i1;
1540 while (i0 + 1 < i1) {
1541 size_t imed = (i1 + i0) / 2;
1542 if (comp (pivot, src[imed])) {i1 = imed; }
1545 s2s[t].i1 = s2s[t + 1].i0 = i1;
1548 s1.i0 = std::min(s1.i0, s2.i0);
1549 s1.i1 = std::max(s1.i1, s2.i1);
1552 for (
int t = 0; t < nt; t++) {
1553 sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
1555 sws[t + 1].i0 = sws[t].i1;
1558 assert(sws[nt - 1].i1 == s1.i1);
1561 #pragma omp parallel for num_threads(nt)
1562 for (
int t = 0; t < nt; t++) {
1563 SegmentS sw = sws[t];
1564 SegmentS s1t = s1s[t];
1565 SegmentS s2t = s2s[t];
1566 if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
1569 if (comp(src[s1t.i0], src[s2t.i0])) {
1570 dst[sw.i0++] = src[s1t.i0++];
1571 if (s1t.i0 == s1t.i1)
break;
1573 dst[sw.i0++] = src[s2t.i0++];
1574 if (s2t.i0 == s2t.i1)
break;
1578 if (s1t.len() > 0) {
1579 assert(s1t.len() == sw.len());
1580 memcpy(dst + sw.i0, src + s1t.i0, s1t.len() *
sizeof(dst[0]));
1581 }
else if (s2t.len() > 0) {
1582 assert(s2t.len() == sw.len());
1583 memcpy(dst + sw.i0, src + s2t.i0, s2t.len() *
sizeof(dst[0]));
1590 void fvec_argsort (
size_t n,
const float *vals,
1593 for (
size_t i = 0; i < n; i++) perm[i] = i;
1594 ArgsortComparator comp = {vals};
1595 std::sort (perm, perm + n, comp);
1598 void fvec_argsort_parallel (
size_t n,
const float *vals,
1601 size_t * perm2 =
new size_t[n];
1603 size_t *permB = perm2, *permA = perm;
1605 int nt = omp_get_max_threads();
1610 nseg = (nseg + 1) / 2;
1611 std::swap (permA, permB);
1615 #pragma omp parallel
1616 for (
size_t i = 0; i < n; i++) permA[i] = i;
1618 ArgsortComparator comp = {vals};
1623 #pragma omp parallel for
1624 for (
int t = 0; t < nt; t++) {
1625 size_t i0 = t * n / nt;
1626 size_t i1 = (t + 1) * n / nt;
1627 SegmentS seg = {i0, i1};
1628 std::sort (permA + seg.i0, permA + seg.i1, comp);
1631 int prev_nested = omp_get_nested();
1636 int nseg1 = (nseg + 1) / 2;
1637 int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
1638 int sub_nseg1 = nseg / 2;
1640 #pragma omp parallel for num_threads(nseg1)
1641 for (
int s = 0; s < nseg; s += 2) {
1642 if (s + 1 == nseg) {
1643 memcpy(permB + segs[s].i0, permA + segs[s].i0,
1644 segs[s].len() *
sizeof(
size_t));
1646 int t0 = s * sub_nt / sub_nseg1;
1647 int t1 = (s + 1) * sub_nt / sub_nseg1;
1648 printf(
"merge %d %d, %d threads\n", s, s + 1, t1 - t0);
1649 parallel_merge(permA, permB, segs[s], segs[s + 1],
1653 for (
int s = 0; s < nseg; s += 2)
1654 segs[s / 2] = segs[s];
1656 std::swap (permA, permB);
1658 assert (permA == perm);
1659 omp_set_nested(prev_nested);
1683 static inline void fvec_madd_ref (
size_t n,
const float *a,
1684 float bf,
const float *b,
float *c) {
1685 for (
size_t i = 0; i < n; i++)
1686 c[i] = a[i] + bf * b[i];
1690 static inline void fvec_madd_sse (
size_t n,
const float *a,
1691 float bf,
const float *b,
float *c) {
1693 __m128 bf4 = _mm_set_ps1 (bf);
1694 __m128 * a4 = (__m128*)a;
1695 __m128 * b4 = (__m128*)b;
1696 __m128 * c4 = (__m128*)c;
1699 *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1707 float bf,
const float *b,
float *c)
1710 ((((
long)a) | ((
long)b) | ((
long)c)) & 15) == 0)
1711 fvec_madd_sse (n, a, bf, b, c);
1713 fvec_madd_ref (n, a, bf, b, c);
1716 static inline int fvec_madd_and_argmin_ref (
size_t n,
const float *a,
1717 float bf,
const float *b,
float *c) {
1721 for (
size_t i = 0; i < n; i++) {
1722 c[i] = a[i] + bf * b[i];
1731 static inline int fvec_madd_and_argmin_sse (
size_t n,
const float *a,
1732 float bf,
const float *b,
float *c) {
1734 __m128 bf4 = _mm_set_ps1 (bf);
1735 __m128 vmin4 = _mm_set_ps1 (1e20);
1736 __m128i imin4 = _mm_set1_epi32 (-1);
1737 __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
1738 __m128i inc4 = _mm_set1_epi32 (4);
1739 __m128 * a4 = (__m128*)a;
1740 __m128 * b4 = (__m128*)b;
1741 __m128 * c4 = (__m128*)c;
1744 __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
1746 __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1749 imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1750 _mm_andnot_si128 (mask, imin4));
1751 vmin4 = _mm_min_ps (vmin4, vc4);
1755 idx4 = _mm_add_epi32 (idx4, inc4);
1760 idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
1761 __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
1762 __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1763 imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1764 _mm_andnot_si128 (mask, imin4));
1765 vmin4 = _mm_min_ps (vmin4, vc4);
1769 idx4 = _mm_shuffle_epi32 (imin4, 1);
1770 __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
1771 __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
1772 imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
1773 _mm_andnot_si128 (mask, imin4));
1776 return _mm_extract_epi32 (imin4, 0);
1781 float bf,
const float *b,
float *c)
1784 ((((
long)a) | ((
long)b) | ((
long)c)) & 15) == 0)
1785 return fvec_madd_and_argmin_sse (n, a, bf, b, c);
1787 return fvec_madd_and_argmin_ref (n, a, bf, b, c);
Random generator that can be used in multithreaded contexts.
void knn_L2sqr_base_shift(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *base_shift)
RandomGenerator(long seed=1234)
Initialize the random generator from the given seed (default 1234).
int km_update_centroids(const float *x, float *centroids, long *assign, size_t d, size_t k, size_t n)
float fvec_L2sqr(const float *x, const float *y, size_t d)
Squared L2 distance between two vectors.
void bincode_hist(size_t n, size_t nbits, const uint8_t *codes, int *hist)
void ranklist_handle_ties(int k, long *idx, const float *dis)
float rand_float()
between 0 and 1
void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)
size_t get_mem_usage_kb()
get current RSS usage in kB
int ivec_hist(size_t n, const int *v, int vmax, int *hist)
compute histogram on v
long rand_long()
random long < 2 ^ 62
int rand_int()
random 31-bit positive integer
size_t ranklist_intersection_size(size_t k1, const long *v1, size_t k2, const long *v2_in)
void pairwise_L2sqr(long d, long nq, const float *xq, long nb, const float *xb, float *dis, long ldq, long ldb, long ldd)
void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
same as range_search_L2sqr for the inner product similarity
void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res)
double getmillisecs()
ms elapsed since some arbitrary epoch
double imbalance_factor(int k, const int *hist)
same, takes a histogram as input
float fvec_norm_L2sqr(const float *x, size_t d)
void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *res)
void matrix_qr(int m, int n, float *a)
size_t ivec_checksum(size_t n, const int *a)
compute a checksum on a table.
int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)
void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res)