#include <immintrin.h>
#include <sys/types.h>

#include "AuxIndexStructures.h"
#include "FaissAssert.h"
extern "C" {

/* Declarations of the BLAS/LAPACK routines used below (Fortran calling
   convention, hence the pointer arguments and the trailing underscore). */

int sgemm_ (const char *transa, const char *transb,
            FINTEGER *m, FINTEGER *n, FINTEGER *k,
            const float *alpha, const float *a, FINTEGER *lda,
            const float *b, FINTEGER *ldb,
            float *beta, float *c, FINTEGER *ldc);

int sgeqrf_ (FINTEGER *m, FINTEGER *n, float *a, FINTEGER *lda,
             float *tau, float *work, FINTEGER *lwork, FINTEGER *info);

int sorgqr_ (FINTEGER *m, FINTEGER *n, FINTEGER *k, float *a,
             FINTEGER *lda, float *tau, float *work,
             FINTEGER *lwork, FINTEGER *info);

}
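/* Illustrative sketch, not part of the original file: how sgemm_ is invoked
 * by the kernels further down. BLAS is Fortran and therefore column-major,
 * so the row-major table of inner products is obtained by computing its
 * column-major transpose. All names in this example are local assumptions. */
static void example_sgemm_inner_products (
        const float *x, const float *y,
        size_t d, size_t nx, size_t ny,
        float *ip /* output, size nx * ny */)
{
    float one = 1, zero = 0;
    FINTEGER nyi = ny, nxi = nx, di = d;
    // after the call, ip[i * ny + j] = <x_i, y_j>
    sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
            y, &di, x, &di, &zero, ip, &nyi);
}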
double getmillisecs ()
{
    struct timeval tv;
    gettimeofday (&tv, nullptr);
    return tv.tv_sec * 1e3 + tv.tv_usec * 1e-3;
}
#ifdef __linux__

size_t get_mem_usage_kb ()
{
    int pid = getpid ();
    char fname[256];
    snprintf (fname, 256, "/proc/%d/status", pid);
    FILE * f = fopen (fname, "r");
    FAISS_THROW_IF_NOT_MSG (f, "cannot open proc status file");
    long sz = 0;  // long to match the %ld conversion below
    for (;;) {
        char buf[256];
        if (!fgets (buf, 256, f))
            break;
        if (sscanf (buf, "VmRSS: %ld kB", &sz) == 1)
            break;
    }
    fclose (f);
    return sz;
}

#else

size_t get_mem_usage_kb ()
{
    fprintf (stderr, "WARN: get_mem_usage_kb not implemented on the mac\n");
    return 0;
}

#endif
#ifdef __linux__

/* The Linux implementation relies on the re-entrant random_r API, so each
   RandomGenerator instance has independent state. */

int RandomGenerator::rand_int ()
{
    int32_t a;
    random_r (&rand_data, &a);
    return a;
}

long RandomGenerator::rand_long ()
{
    int32_t a, b;
    random_r (&rand_data, &a);
    random_r (&rand_data, &b);
    return long(a) | long(b) << 31;
}

RandomGenerator::RandomGenerator (long seed)
{
    memset (&rand_data, 0, sizeof (rand_data));
    initstate_r (seed, rand_state, sizeof (rand_state), &rand_data);
}

RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
    memcpy (rand_state, other.rand_state, sizeof (rand_state));
    rand_data = other.rand_data;
    setstate_r (rand_state, &rand_data);
}

#else

// rand_r is not always declared by the system headers
int rand_r (unsigned *seed);

RandomGenerator::RandomGenerator (long seed)
{
    rand_state = seed;
}

RandomGenerator::RandomGenerator (const RandomGenerator & other)
{
    rand_state = other.rand_state;
}

int RandomGenerator::rand_int ()
{
    // XOR in the high bits so that all 31 output bits carry some entropy
    int lowbits = rand_r (&rand_state) >> 15;
    return rand_r (&rand_state) ^ lowbits;
}

long RandomGenerator::rand_long ()
{
    return long(random()) | long(random()) << 31;
}

#endif

int RandomGenerator::rand_int (int max)
{
    // not perfectly uniform when max is not a power of 2, but the bias
    // is negligible when max << 2^31
    return rand_int () % max;
}

float RandomGenerator::rand_float ()
{
    return rand_int () / float(1L << 31);
}

double RandomGenerator::rand_double ()
{
    return rand_long () / double(1L << 62);
}
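/* Usage sketch (illustrative, not library code): each RandomGenerator owns
 * its state, so per-thread instances are safe in multithreaded code. */
static void example_random_generator ()
{
    RandomGenerator rng (1234);     // fixed seed => reproducible sequence
    int   i = rng.rand_int ();      // 31-bit non-negative integer
    long  l = rng.rand_long ();     // non-negative, < 2^62
    float f = rng.rand_float ();    // in [0, 1)
    (void) i; (void) l; (void) f;
}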
void float_rand (float * x, size_t n, long seed)
{
    // process by blocks so the result does not depend on the number
    // of threads; only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_float ();
    }
}
void float_randn (float * x, size_t n, long seed)
{
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        double a = 0, b = 0, s = 0;
        int state = 0;  // each rejection loop produces two samples

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++) {
            // Marsaglia's polar variant of the Box-Muller transform
            if (state == 0) {
                do {
                    a = 2.0 * rng.rand_double () - 1;
                    b = 2.0 * rng.rand_double () - 1;
                    s = a * a + b * b;
                } while (s >= 1.0);
                x[i] = a * sqrt(-2.0 * log(s) / s);
            } else {
                x[i] = b * sqrt(-2.0 * log(s) / s);
            }
            state = 1 - state;
        }
    }
}
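/* Sanity-check sketch (illustrative, not library code): samples from
 * float_randn should have mean ~0 and variance ~1. */
static void example_check_float_randn ()
{
    const size_t n = 100000;
    std::vector<float> x (n);
    float_randn (x.data (), n, 1234);
    double sum = 0, sum2 = 0;
    for (size_t i = 0; i < n; i++) {
        sum += x[i];
        sum2 += double (x[i]) * x[i];
    }
    double mean = sum / n;
    printf ("mean=%g (expect ~0), var=%g (expect ~1)\n",
            mean, sum2 / n - mean * mean);
}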
void long_rand (long * x, size_t n, long seed)
{
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;
        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}
void rand_perm (int *perm, size_t n, long seed)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;

    RandomGenerator rng (seed);

    // Fisher-Yates shuffle
    for (size_t i = 0; i + 1 < n; i++) {
        int i2 = i + rng.rand_int (n - i);
        std::swap (perm[i], perm[i2]);
    }
}
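/* Usage sketch (illustrative, not library code): the head of a random
 * permutation gives k distinct indices sampled without replacement. */
static void example_rand_perm ()
{
    std::vector<int> perm (100);
    rand_perm (perm.data (), perm.size (), 1234);
    // perm[0..9] are now 10 distinct values in [0, 100)
}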
void byte_rand (uint8_t * x, size_t n, long seed)
{
    const size_t nblock = n < 1024 ? 1 : 1024;

    RandomGenerator rng0 (seed);
    int a0 = rng0.rand_int (), b0 = rng0.rand_int ();

#pragma omp parallel for
    for (size_t j = 0; j < nblock; j++) {
        RandomGenerator rng (a0 + j * b0);

        const size_t istart = j * n / nblock;
        const size_t iend = (j + 1) * n / nblock;

        for (size_t i = istart; i < iend; i++)
            x[i] = rng.rand_long ();
    }
}
/* Apply nu Householder reflections (the rows of u) to each of the n
   d-dimensional vectors in x: x <- x - 2 <u, x> u. The inner loop is
   unrolled by 2, so d must be even. */
void reflection (const float * __restrict u,
                 float * __restrict x,
                 size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            float ip1 = 0, ip2 = 0;

            for (j = 0; j < d; j += 2) {
                ip1 += up[j] * x[j];
                ip2 += up[j+1] * x[j+1];
            }
            float ip = 2 * (ip1 + ip2);

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];
            up += d;
        }
        x += d;
    }
}
/* Reference implementation of the above, without unrolling */
void reflection_ref (const float * u, float * x,
                     size_t n, size_t d, size_t nu)
{
    size_t i, j, l;
    for (i = 0; i < n; i++) {
        const float * up = u;
        for (l = 0; l < nu; l++) {
            float ip = 0;

            for (j = 0; j < d; j++)
                ip += up[j] * x[j];
            ip *= 2;

            for (j = 0; j < d; j++)
                x[j] -= ip * up[j];

            up += d;
        }
        x += d;
    }
}
float fvec_L2sqr_ref (const float * x,
                      const float * y,
                      size_t d)
{
    size_t i;
    float res = 0;
    for (i = 0; i < d; i++) {
        const float tmp = x[i] - y[i];
        res += tmp * tmp;
    }
    return res;
}

float fvec_inner_product_ref (const float * x,
                              const float * y,
                              size_t d)
{
    size_t i;
    float res = 0;
    for (i = 0; i < d; i++)
        res += x[i] * y[i];
    return res;
}

float fvec_norm_L2sqr_ref (const float * __restrict x,
                           size_t d)
{
    size_t i;
    double res = 0;
    for (i = 0; i < d; i++)
        res += x[i] * x[i];
    return res;
}
// reads 0 <= d < 4 floats as a __m128, zero-padding the tail
static inline __m128 masked_read (int d, const float *x)
{
    assert (0 <= d && d < 4);
    __attribute__((__aligned__(16))) float buf[4] = {0, 0, 0, 0};
    switch (d) {
      case 3:
        buf[2] = x[2];  // intentional fall-through
      case 2:
        buf[1] = x[1];
      case 1:
        buf[0] = x[0];
    }
    return _mm_load_ps (buf);
}
#ifdef __AVX__

// reads 0 <= d < 8 floats as a __m256, zero-padding the tail
static inline __m256 masked_read_8 (int d, const float *x)
{
    assert (0 <= d && d < 8);
    if (d < 4) {
        __m256 res = _mm256_setzero_ps ();
        res = _mm256_insertf128_ps (res, masked_read (d, x), 0);
        return res;
    } else {
        __m256 res = _mm256_setzero_ps ();
        res = _mm256_insertf128_ps (res, _mm_loadu_ps (x), 0);
        res = _mm256_insertf128_ps (res, masked_read (d - 4, x + 4), 1);
        return res;
    }
}
float fvec_inner_product (const float * x,
                          const float * y,
                          size_t d)
{
    __m256 msum1 = _mm256_setzero_ps();

    while (d >= 8) {
        __m256 mx = _mm256_loadu_ps (x); x += 8;
        __m256 my = _mm256_loadu_ps (y); y += 8;
        msum1 = _mm256_add_ps (msum1, _mm256_mul_ps (mx, my));
        d -= 8;
    }

    // fold the two 128-bit halves together
    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
    msum2 +=       _mm256_extractf128_ps(msum1, 0);

    if (d >= 4) {
        __m128 mx = _mm_loadu_ps (x); x += 4;
        __m128 my = _mm_loadu_ps (y); y += 4;
        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
        d -= 4;
    }

    if (d > 0) {
        __m128 mx = masked_read (d, x);
        __m128 my = masked_read (d, y);
        msum2 = _mm_add_ps (msum2, _mm_mul_ps (mx, my));
    }

    msum2 = _mm_hadd_ps (msum2, msum2);
    msum2 = _mm_hadd_ps (msum2, msum2);
    return _mm_cvtss_f32 (msum2);
}
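/* Test sketch (illustrative, not library code): the SIMD kernels should
 * agree with the _ref implementations up to floating-point reassociation
 * error; d is deliberately not a multiple of 8 to exercise the masked
 * tail loads. */
static void example_check_inner_product ()
{
    const size_t d = 37;
    std::vector<float> x (d), y (d);
    float_rand (x.data (), d, 123);
    float_rand (y.data (), d, 456);
    float simd = fvec_inner_product (x.data (), y.data (), d);
    float ref  = fvec_inner_product_ref (x.data (), y.data (), d);
    assert (fabs (simd - ref) < 1e-4 * fabs (ref) + 1e-6);
}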
float fvec_L2sqr (const float * x,
                  const float * y,
                  size_t d)
{
    __m256 msum1 = _mm256_setzero_ps();

    while (d >= 8) {
        __m256 mx = _mm256_loadu_ps (x); x += 8;
        __m256 my = _mm256_loadu_ps (y); y += 8;
        const __m256 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
        d -= 8;
    }

    __m128 msum2 = _mm256_extractf128_ps(msum1, 1);
    msum2 +=       _mm256_extractf128_ps(msum1, 0);

    if (d >= 4) {
        __m128 mx = _mm_loadu_ps (x); x += 4;
        __m128 my = _mm_loadu_ps (y); y += 4;
        const __m128 a_m_b1 = mx - my;
        msum2 += a_m_b1 * a_m_b1;
        d -= 4;
    }

    if (d > 0) {
        __m128 mx = masked_read (d, x);
        __m128 my = masked_read (d, y);
        __m128 a_m_b1 = mx - my;
        msum2 += a_m_b1 * a_m_b1;
    }

    msum2 = _mm_hadd_ps (msum2, msum2);
    msum2 = _mm_hadd_ps (msum2, msum2);
    return _mm_cvtss_f32 (msum2);
}
#else /* !__AVX__: SSE implementations */

float fvec_L2sqr (const float * x,
                  const float * y,
                  size_t d)
{
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        __m128 mx = _mm_loadu_ps (x); x += 4;
        __m128 my = _mm_loadu_ps (y); y += 4;
        const __m128 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
        d -= 4;
    }

    if (d > 0) {
        // add the last 1, 2 or 3 values
        __m128 mx = masked_read (d, x);
        __m128 my = masked_read (d, y);
        __m128 a_m_b1 = mx - my;
        msum1 += a_m_b1 * a_m_b1;
    }

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}
float fvec_inner_product (const float * x,
                          const float * y,
                          size_t d)
{
    __m128 mx, my;
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        mx = _mm_loadu_ps (x); x += 4;
        my = _mm_loadu_ps (y); y += 4;
        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, my));
        d -= 4;
    }

    // add the last 1, 2 or 3 values
    mx = masked_read (d, x);
    my = masked_read (d, y);
    __m128 prod = _mm_mul_ps (mx, my);

    msum1 = _mm_add_ps (msum1, prod);

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}

#endif /* __AVX__ */
float fvec_norm_L2sqr (const float * __restrict x,
                       size_t d)
{
    __m128 mx;
    __m128 msum1 = _mm_setzero_ps();

    while (d >= 4) {
        mx = _mm_loadu_ps (x); x += 4;
        msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));
        d -= 4;
    }

    mx = masked_read (d, x);
    msum1 = _mm_add_ps (msum1, _mm_mul_ps (mx, mx));

    msum1 = _mm_hadd_ps (msum1, msum1);
    msum1 = _mm_hadd_ps (msum1, msum1);
    return _mm_cvtss_f32 (msum1);
}
void fvec_inner_products_ny (float * __restrict ip,
                             const float * x,
                             const float * y,
                             size_t d, size_t ny)
{
    for (size_t i = 0; i < ny; i++) {
        ip[i] = fvec_inner_product (x, y, d);
        y += d;
    }
}
void fvec_L2sqr_ny (float * __restrict dis,
                    const float * x,
                    const float * y,
                    size_t d, size_t ny)
{
    for (size_t i = 0; i < ny; i++) {
        dis[i] = fvec_L2sqr (x, y, d);
        y += d;
    }
}
void fvec_norms_L2 (float * __restrict nr,
                    const float * __restrict x,
                    size_t d, size_t nx)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        nr[i] = sqrtf (fvec_norm_L2sqr (x + i * d, d));
    }
}
void fvec_norms_L2sqr (float * __restrict nr,
                       const float * __restrict x,
                       size_t d, size_t nx)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++)
        nr[i] = fvec_norm_L2sqr (x + i * d, d);
}
void fvec_renorm_L2 (size_t d, size_t nx, float * __restrict x)
{
#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        float * __restrict xi = x + i * d;

        float nr = fvec_norm_L2sqr (xi, d);

        if (nr > 0) {
            size_t j;
            const float inv_nr = 1.0 / sqrtf (nr);
            for (j = 0; j < d; j++)
                xi[j] *= inv_nr;
        }
    }
}
/* kNN by inner product, one query at a time (best when nx is small) */
static void knn_inner_product_sse (const float * x,
                                   const float * y,
                                   size_t d, size_t nx, size_t ny,
                                   float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const float * y_ = y;

        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        minheap_heapify (k, simi, idxi);

        for (size_t j = 0; j < ny; j++) {
            float ip = fvec_inner_product (x_, y_, d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, j);
            }
            y_ += d;
        }

        minheap_reorder (k, simi, idxi);
    }
}
static void knn_L2sqr_sse (const float * x,
                           const float * y,
                           size_t d, size_t nx, size_t ny,
                           float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const float * y_ = y;

        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);

        maxheap_heapify (k, simi, idxi);
        for (size_t j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y_, d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, j);
            }
            y_ += d;
        }
        maxheap_reorder (k, simi, idxi);
    }
}
/* Blocked BLAS version: compute inner products in bs_x * bs_y tiles with
   sgemm_ and collect the per-query maxima into the result heaps */
static void knn_inner_product_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_minheap_array_t * res)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    float *ip_block = new float[bs_x * bs_y];

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect maxima */
            res->addn (j1 - j0, ip_block, j0, i0, i1 - i0);
        }
    }
    delete [] ip_block;
    res->reorder ();
}
template<class DistanceCorrection>
static void knn_L2sqr_blas (const float * x,
                            const float * y,
                            size_t d, size_t nx, size_t ny,
                            float_maxheap_array_t * res,
                            const DistanceCorrection &corr)
{
    res->heapify ();

    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    size_t k = res->k;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = new float[nx];
    fvec_norms_L2sqr (x_norms, x, d, nx);

    float *y_norms = new float[ny];
    fvec_norms_L2sqr (y_norms, y, d, ny);

    for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
        size_t i1 = i0 + bs_x;
        if (i1 > nx) i1 = nx;

        for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
            size_t j1 = j0 + bs_y;
            if (j1 > ny) j1 = ny;
            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            /* collect minima */
#pragma omp parallel for
            for (size_t i = i0; i < i1; i++) {
                float * __restrict simi = res->get_val(i);
                long * __restrict idxi = res->get_ids (i);
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    // ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>
                    float dis = x_norms[i] + y_norms[j] - 2 * ip;

                    dis = corr (dis, i, j);

                    if (dis < simi[0]) {
                        maxheap_pop (k, simi, idxi);
                        maxheap_push (k, simi, idxi, dis, j);
                    }
                }
            }
        }
    }
    res->reorder ();

    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;
}
/* below this number of query vectors, the direct (SSE) implementation is
   faster than the blocked BLAS one */
int distance_compute_blas_threshold = 20;

void knn_inner_product (const float * x,
                        const float * y,
                        size_t d, size_t nx, size_t ny,
                        float_minheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_inner_product_sse (x, y, d, nx, ny, res);
    } else {
        knn_inner_product_blas (x, y, d, nx, ny, res);
    }
}
struct NopDistanceCorrection {
    float operator()(float dis, size_t /*qno*/, size_t /*bno*/) const {
        return dis;
    }
};
void knn_L2sqr (const float * x,
                const float * y,
                size_t d, size_t nx, size_t ny,
                float_maxheap_array_t * res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        knn_L2sqr_sse (x, y, d, nx, ny, res);
    } else {
        NopDistanceCorrection nop;
        knn_L2sqr_blas (x, y, d, nx, ny, res, nop);
    }
}
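/* Usage sketch (illustrative, not library code): wiring caller-owned output
 * buffers into a heap array before calling knn_L2sqr. The brace initializer
 * follows the HeapArray field order (nh, k, ids, val) from Heap.h; treat
 * that ordering as an assumption of this example. */
static void example_knn_L2sqr (const float *xq, const float *xb,
                               size_t d, size_t nq, size_t nb, size_t k,
                               float *distances /* nq * k */,
                               long *labels /* nq * k */)
{
    float_maxheap_array_t res = { nq, k, labels, distances };
    knn_L2sqr (xq, xb, d, nq, nb, &res);
    // row i of distances/labels now holds the k nearest base vectors of
    // query i, sorted by increasing squared L2 distance
}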
struct BaseShiftDistanceCorrection {
    const float *base_shift;
    float operator()(float dis, size_t /*qno*/, size_t bno) const {
        return dis - base_shift[bno];
    }
};
void knn_L2sqr_base_shift (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float_maxheap_array_t * res,
        const float *base_shift)
{
    BaseShiftDistanceCorrection corr = {base_shift};
    knn_L2sqr_blas (x, y, d, nx, ny, res, corr);
}
void fvec_inner_products_by_idx (float * __restrict ip,
                                 const float * x,
                                 const float * y,
                                 const long * __restrict ids, /* for y vecs */
                                 size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict ipj = ip + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            ipj[i] = fvec_inner_product (xj, y + d * idsj[i], d);
        }
    }
}
void fvec_L2sqr_by_idx (float * __restrict dis,
                        const float * x,
                        const float * y,
                        const long * __restrict ids, /* ids of y vecs */
                        size_t d, size_t nx, size_t ny)
{
#pragma omp parallel for
    for (size_t j = 0; j < nx; j++) {
        const long * __restrict idsj = ids + j * ny;
        const float * xj = x + j * d;
        float * __restrict disj = dis + j * ny;
        for (size_t i = 0; i < ny; i++) {
            if (idsj[i] < 0)
                continue;
            disj[i] = fvec_L2sqr (xj, y + d * idsj[i], d);
        }
    }
}
void knn_inner_products_by_idx (const float * x,
                                const float * y,
                                const long * ids,
                                size_t d, size_t nx, size_t ny,
                                float_minheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * idsi = ids + i * ny;

        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        minheap_heapify (k, simi, idxi);

        for (size_t j = 0; j < ny; j++) {
            if (idsi[j] < 0) break;
            float ip = fvec_inner_product (x_, y + d * idsi[j], d);

            if (ip > simi[0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, ip, idsi[j]);
            }
        }
        minheap_reorder (k, simi, idxi);
    }
}
void knn_L2sqr_by_idx (const float * x,
                       const float * y,
                       const long * __restrict ids,
                       size_t d, size_t nx, size_t ny,
                       float_maxheap_array_t * res)
{
    size_t k = res->k;

#pragma omp parallel for
    for (size_t i = 0; i < nx; i++) {
        const float * x_ = x + i * d;
        const long * __restrict idsi = ids + i * ny;
        float * __restrict simi = res->get_val(i);
        long * __restrict idxi = res->get_ids (i);
        maxheap_heapify (k, simi, idxi);
        for (size_t j = 0; j < ny; j++) {
            float disij = fvec_L2sqr (x_, y + d * idsi[j], d);

            if (disij < simi[0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, disij, idsi[j]);
            }
        }
        maxheap_reorder (k, simi, idxi);
    }
}
/* Blocked BLAS range search; templated on whether squared L2
   (compute_l2 = true) or inner product is used */
template <bool compute_l2>
static void range_search_blas (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *result)
{
    // BLAS does not like empty matrices
    if (nx == 0 || ny == 0) return;

    /* block sizes */
    const size_t bs_x = 4096, bs_y = 1024;
    float *ip_block = new float[bs_x * bs_y];

    float *x_norms = nullptr, *y_norms = nullptr;

    if (compute_l2) {
        x_norms = new float[nx];
        fvec_norms_L2sqr (x_norms, x, d, nx);
        y_norms = new float[ny];
        fvec_norms_L2sqr (y_norms, y, d, ny);
    }

    std::vector <RangeSearchPartialResult *> partial_results;

    for (size_t j0 = 0; j0 < ny; j0 += bs_y) {
        size_t j1 = j0 + bs_y;
        if (j1 > ny) j1 = ny;
        RangeSearchPartialResult * pres = new RangeSearchPartialResult (result);
        partial_results.push_back (pres);

        for (size_t i0 = 0; i0 < nx; i0 += bs_x) {
            size_t i1 = i0 + bs_x;
            if (i1 > nx) i1 = nx;

            /* compute the actual dot products */
            {
                float one = 1, zero = 0;
                FINTEGER nyi = j1 - j0, nxi = i1 - i0, di = d;
                sgemm_ ("Transpose", "Not transpose", &nyi, &nxi, &di, &one,
                        y + j0 * d, &di,
                        x + i0 * d, &di, &zero,
                        ip_block, &nyi);
            }

            for (size_t i = i0; i < i1; i++) {
                const float *ip_line = ip_block + (i - i0) * (j1 - j0);

                RangeSearchPartialResult::QueryResult & qres =
                    pres->new_result (i);

                for (size_t j = j0; j < j1; j++) {
                    float ip = *ip_line++;
                    if (compute_l2) {
                        float dis = x_norms[i] + y_norms[j] - 2 * ip;
                        if (dis < radius) {
                            qres.add (dis, j);
                        }
                    } else {
                        if (ip > radius) {
                            qres.add (ip, j);
                        }
                    }
                }
            }
        }
    }
    delete [] ip_block;
    delete [] x_norms;
    delete [] y_norms;

    { // merge the partial results
        int npres = partial_results.size();
        // count the number of results per query
        for (size_t i = 0; i < nx; i++) {
            for (int j = 0; j < npres; j++)
                result->lims[i] += partial_results[j]->queries[i].nres;
        }
        result->do_allocation ();
        for (int j = 0; j < npres; j++) {
            partial_results[j]->set_result (true);
            delete partial_results[j];
        }

        // turn the per-query counts into a cumulative table starting at 0
        for (size_t i = nx; i > 0; i--) {
            result->lims [i] = result->lims [i - 1];
        }
        result->lims [0] = 0;
    }
}
template <bool compute_l2>
static void range_search_sse (const float * x,
                              const float * y,
                              size_t d, size_t nx, size_t ny,
                              float radius,
                              RangeSearchResult *res)
{
    FAISS_THROW_IF_NOT (d % 4 == 0);

#pragma omp parallel
    {
        RangeSearchPartialResult pres (res);

#pragma omp for
        for (size_t i = 0; i < nx; i++) {
            const float * x_ = x + i * d;
            const float * y_ = y;

            RangeSearchPartialResult::QueryResult & qres =
                pres.new_result (i);

            for (size_t j = 0; j < ny; j++) {
                if (compute_l2) {
                    float disij = fvec_L2sqr (x_, y_, d);
                    if (disij < radius) {
                        qres.add (disij, j);
                    }
                } else {
                    float ip = fvec_inner_product (x_, y_, d);
                    if (ip > radius) {
                        qres.add (ip, j);
                    }
                }
                y_ += d;
            }
        }
        pres.finalize ();
    }
}
void range_search_L2sqr (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<true> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<true> (x, y, d, nx, ny, radius, res);
    }
}

void range_search_inner_product (
        const float * x,
        const float * y,
        size_t d, size_t nx, size_t ny,
        float radius,
        RangeSearchResult *res)
{
    if (d % 4 == 0 && nx < distance_compute_blas_threshold) {
        range_search_sse<false> (x, y, d, nx, ny, radius, res);
    } else {
        range_search_blas<false> (x, y, d, nx, ny, radius, res);
    }
}
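/* Usage sketch (illustrative, not library code): reading back a
 * RangeSearchResult. The lims array has nx + 1 entries after the search, and
 * the ids/distances of query i live in [lims[i], lims[i+1]). The constructor
 * signature and field names follow AuxIndexStructures.h and should be
 * treated as assumptions of this example. */
static void example_range_search (const float *xq, const float *xb,
                                  size_t d, size_t nq, size_t nb,
                                  float radius)
{
    RangeSearchResult res (nq);
    range_search_L2sqr (xq, xb, d, nq, nb, radius, &res);
    for (size_t i = 0; i < nq; i++) {
        for (size_t j = res.lims[i]; j < res.lims[i + 1]; j++) {
            printf ("query %ld: id %ld dis %g\n",
                    long (i), long (res.labels[j]), res.distances[j]);
        }
    }
}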
/* In-place conversion of a table of inner products into squared L2
   distances, given precomputed norms:
   ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y> */
void inner_product_to_L2sqr (float * __restrict dis,
                             const float * nr1,
                             const float * nr2,
                             size_t n1, size_t n2)
{
#pragma omp parallel for
    for (size_t j = 0 ; j < n1 ; j++) {
        float * disj = dis + j * n2;
        for (size_t i = 0 ; i < n2 ; i++)
            disj[i] = nr1[j] + nr2[i] - 2 * disj[i];
    }
}
void matrix_qr (int m, int n, float *a)
{
    FAISS_THROW_IF_NOT (m >= n);
    FINTEGER mi = m, ni = n, ki = mi < ni ? mi : ni;
    std::vector<float> tau (ki);
    FINTEGER lwork = -1, info;
    float work_size;

    // workspace query (lwork == -1): sgeqrf_ only reports the optimal size
    sgeqrf_ (&mi, &ni, a, &mi, tau.data(),
             &work_size, &lwork, &info);
    lwork = size_t(work_size);
    std::vector<float> work (lwork);

    // actual QR factorization ...
    sgeqrf_ (&mi, &ni, a, &mi,
             tau.data(), work.data(), &lwork, &info);

    // ... then expand the Householder reflectors into the Q factor
    sorgqr_ (&mi, &ni, &ki, a, &mi, tau.data(),
             work.data(), &lwork, &info);
}
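/* Usage sketch (illustrative, not library code): orthonormalize the columns
 * of a random matrix in place. LAPACK expects column-major storage, so a
 * holds an (m, n) matrix with column j starting at a + j * m. */
static void example_matrix_qr ()
{
    int m = 100, n = 20;                  // requires m >= n
    std::vector<float> a (size_t (m) * n);
    float_rand (a.data (), a.size (), 1234);
    matrix_qr (m, n, a.data ());
    // the n columns of a are now orthonormal
}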
void pairwise_L2sqr (long d,
                     long nq, const float *xq,
                     long nb, const float *xb,
                     float *dis,
                     long ldq, long ldb, long ldd)
{
    if (nq == 0 || nb == 0) return;
    // negative strides mean "compact layout"
    if (ldq == -1) ldq = d;
    if (ldb == -1) ldb = d;
    if (ldd == -1) ldd = nb;

    // store the base norms in the first row of the distance matrix to
    // avoid a malloc
    float *b_norms = dis;

#pragma omp parallel for
    for (long i = 0; i < nb; i++)
        b_norms[i] = fvec_norm_L2sqr (xb + i * ldb, d);

    // add the query norms; row 0 is handled last because it holds b_norms
#pragma omp parallel for
    for (long i = 1; i < nq; i++) {
        float q_norm = fvec_norm_L2sqr (xq + i * ldq, d);
        for (long j = 0; j < nb; j++)
            dis[i * ldd + j] = q_norm + b_norms[j];
    }

    {
        float q_norm = fvec_norm_L2sqr (xq, d);
        for (long j = 0; j < nb; j++)
            dis[j] += q_norm;
    }

    {
        FINTEGER nbi = nb, nqi = nq, di = d, ldqi = ldq, ldbi = ldb, lddi = ldd;
        float one = 1.0, minus_2 = -2.0;

        // dis := dis - 2 * xq * xb'
        sgemm_ ("Transposed", "Not transposed",
                &nbi, &nqi, &di,
                &minus_2,
                xb, &ldbi,
                xq, &ldqi,
                &one, dis, &lddi);
    }
}
// perturbation applied when splitting an empty cluster
#define EPS (1 / 1024.)
/* For k-means: given an assignment of vectors to centroids, recompute the
 * centroid positions. The first k_frozen centroids are left untouched.
 * Returns the number of empty clusters that had to be split. */
int km_update_centroids (const float * x,
                         float * centroids,
                         long * assign,
                         size_t d, size_t k, size_t n,
                         size_t k_frozen)
{
    k -= k_frozen;
    centroids += k_frozen * d;

    std::vector<size_t> hassign (k);
    memset (centroids, 0, sizeof(*centroids) * d * k);

#pragma omp parallel
    {
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();
        // this thread is taking care of centroids c0:c1
        size_t c0 = (k * rank) / nt;
        size_t c1 = (k * (rank + 1)) / nt;
        const float *xi = x;

        for (size_t i = 0; i < n; i++) {
            long ci = assign[i];
            assert (ci >= 0 && ci < k + k_frozen);
            ci -= k_frozen;
            if (ci >= c0 && ci < c1) {
                float * c = centroids + ci * d;
                hassign[ci]++;
                for (size_t j = 0; j < d; j++)
                    c[j] += xi[j];
            }
            xi += d;
        }
    }

#pragma omp parallel for
    for (size_t ci = 0; ci < k; ci++) {
        float * c = centroids + ci * d;
        float ni = (float) hassign[ci];
        if (ni != 0) {
            for (size_t j = 0; j < d; j++)
                c[j] /= ni;
        }
    }

    /* Take care of void clusters */
    size_t nsplit = 0;
    RandomGenerator rng (1234);
    for (size_t ci = 0; ci < k; ci++) {
        if (hassign[ci] == 0) { /* need to redefine a centroid */
            size_t cj;
            for (cj = 0; 1; cj = (cj + 1) % k) {
                /* probability to pick this cluster for split */
                float p = (hassign[cj] - 1.0) / (float) (n - k);
                float r = rng.rand_float ();
                if (r < p) break; /* found our cluster to be split */
            }
            memcpy (centroids + ci * d, centroids + cj * d,
                    sizeof(*centroids) * d);

            /* small symmetric perturbation */
            for (size_t j = 0; j < d; j++) {
                if (j % 2 == 0) {
                    centroids[ci * d + j] *= 1 + EPS;
                    centroids[cj * d + j] *= 1 - EPS;
                } else {
                    centroids[ci * d + j] *= 1 - EPS;
                    centroids[cj * d + j] *= 1 + EPS;
                }
            }

            /* assume even split of the cluster */
            hassign[ci] = hassign[cj] / 2;
            hassign[cj] -= hassign[ci];
            nsplit++;
        }
    }

    return nsplit;
}
/* Sort each equal-distance block of idx by id so that ties are returned in
   a deterministic order */
void ranklist_handle_ties (int k, long *idx, const float *dis)
{
    float prev_dis = -1e38;
    int prev_i = -1;
    for (int i = 0; i < k; i++) {
        if (dis[i] != prev_dis) {
            if (i > prev_i + 1) {
                // sort between prev_i and i - 1
                std::sort (idx + prev_i, idx + i);
            }
            prev_i = i;
            prev_dis = dis[i];
        }
    }
    if (k > prev_i + 1) {
        std::sort (idx + prev_i, idx + k);
    }
}
/* Merge the result tables of two searches: for each of the n queries, keep
 * the k best of the 2*k candidates. Ids from the second table are shifted
 * by translation. Returns the number of elements taken from table 1. */
size_t merge_result_table_with (size_t n, size_t k,
                                long *I0, float *D0,
                                const long *I1, const float *D1,
                                bool keep_min,
                                long translation)
{
    size_t n1 = 0;

#pragma omp parallel reduction(+:n1)
    {
        std::vector<long> tmpI (k);
        std::vector<float> tmpD (k);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long *lI0 = I0 + i * k;
            float *lD0 = D0 + i * k;
            const long *lI1 = I1 + i * k;
            const float *lD1 = D1 + i * k;
            size_t r0 = 0, r1 = 0;

            if (keep_min) {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] < lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both lists exhausted
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            } else {
                for (size_t j = 0; j < k; j++) {
                    if (lI0[r0] >= 0 && lD0[r0] > lD1[r1]) {
                        tmpD[j] = lD0[r0];
                        tmpI[j] = lI0[r0];
                        r0++;
                    } else if (lD1[r1] >= 0) {
                        tmpD[j] = lD1[r1];
                        tmpI[j] = lI1[r1] + translation;
                        r1++;
                    } else { // both lists exhausted
                        tmpD[j] = NAN;
                        tmpI[j] = -1;
                    }
                }
            }
            n1 += r1;
            memcpy (lD0, tmpD.data(), sizeof (lD0[0]) * k);
            memcpy (lI0, tmpI.data(), sizeof (lI0[0]) * k);
        }
    }

    return n1;
}
size_t ranklist_intersection_size (size_t k1, const long *v1,
                                   size_t k2, const long *v2_in)
{
    long *v2 = new long [k2];
    memcpy (v2, v2_in, sizeof (long) * k2);
    std::sort (v2, v2 + k2);
    { // de-dup v2
        long prev = -1;
        size_t wp = 0;
        for (size_t i = 0; i < k2; i++) {
            if (v2 [i] != prev) {
                v2[wp++] = prev = v2 [i];
            }
        }
        k2 = wp;
    }
    const long seen_flag = 1L << 60;
    size_t count = 0;
    for (size_t i = 0; i < k1; i++) {
        long q = v1 [i];
        // binary search in the sorted, de-duped v2
        size_t i0 = 0, i1 = k2;
        while (i0 + 1 < i1) {
            size_t imed = (i1 + i0) / 2;
            long piv = v2 [imed] & ~seen_flag;
            if (piv <= q) i0 = imed;
            else          i1 = imed;
        }
        if ((v2 [i0] & ~seen_flag) == q && !(v2 [i0] & seen_flag)) {
            count++;
            v2 [i0] |= seen_flag; // count each common element only once
        }
    }
    delete [] v2;

    return count;
}
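/* Usage sketch (illustrative, not library code): a recall-style overlap
 * measure between a reference ranked list and a candidate one, as a
 * fraction of k. */
static double example_intersection_ratio (size_t k, const long *ref,
                                          const long *cand)
{
    return ranklist_intersection_size (k, ref, k, cand) / double (k);
}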
/* Imbalance of a histogram of cluster sizes: 1 means perfectly balanced */
double imbalance_factor (int k, const int *hist)
{
    double tot = 0, uf = 0;

    for (int i = 0 ; i < k ; i++) {
        tot += hist[i];
        uf += hist[i] * (double) hist[i];
    }
    uf = uf * k / (tot * tot);

    return uf;
}
double imbalance_factor (int n, int k, const long *assign)
{
    std::vector<int> hist (k, 0);
    for (int i = 0; i < n; i++) {
        hist[assign[i]]++;
    }
    return imbalance_factor (k, hist.data());
}
/* Histogram of v; values outside [0, vmax) are counted in the return value */
int ivec_hist (size_t n, const int * v, int vmax, int *hist)
{
    memset (hist, 0, sizeof(hist[0]) * vmax);
    int nout = 0;
    while (n--) {
        if (v[n] < 0 || v[n] >= vmax) nout++;
        else hist[v[n]]++;
    }
    return nout;
}
/* Per-bit histogram over n binary codes of nbits bits each */
void bincode_hist (size_t n, size_t nbits, const uint8_t *codes, int *hist)
{
    FAISS_THROW_IF_NOT (nbits % 8 == 0);
    size_t d = nbits / 8;
    std::vector<int> accu (d * 256);
    const uint8_t *c = codes;
    // first accumulate per-byte-value counts ...
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < d; j++)
            accu[j * 256 + *c++]++;
    memset (hist, 0, sizeof(*hist) * nbits);
    // ... then expand the byte histograms into per-bit counts
    for (size_t i = 0; i < d; i++) {
        const int *ai = accu.data() + i * 256;
        int * hi = hist + i * 8;
        for (int j = 0; j < 256; j++)
            for (int k = 0; k < 8; k++)
                if (j & (1 << k))
                    hi[k] += ai[j];
    }
}
size_t ivec_checksum (size_t n, const int *a)
{
    size_t cs = 112909;
    while (n--) cs = cs * 65713 + a[n] * 1686049;
    return cs;
}
struct ArgsortComparator {
    const float *vals;
    bool operator() (const size_t a, const size_t b) const {
        return vals[a] < vals[b];
    }
};

struct SegmentS {
    size_t i0; // begin of the segment in the permutation array
    size_t i1; // end
    size_t len() const {
        return i1 - i0;
    }
};
/* 2-segment parallel merge, following
   https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge, extended to
   more than one merge thread. Merges two ranges that are consecutive in src
   into their union in dst; s1 is updated to the merged segment. */
template<typename T>
void parallel_merge (const T *src, T *dst,
                     SegmentS &s1, SegmentS & s2, int nt,
                     const ArgsortComparator & comp) {
    if (s2.len() > s1.len()) { // make sure that s1 is the larger segment
        std::swap(s1, s2);
    }

    // compute sub-ranges for each thread
    SegmentS s1s[nt], s2s[nt], sws[nt];
    s2s[0].i0 = s2.i0;
    s2s[nt - 1].i1 = s2.i1;

#pragma omp parallel for num_threads(nt)
    for (int t = 0; t < nt; t++) {
        s1s[t].i0 = s1.i0 + s1.len() * t / nt;
        s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt;

        if (t + 1 < nt) {
            // binary search in s2 for the pivot that splits it
            T pivot = src[s1s[t].i1];
            size_t i0 = s2.i0, i1 = s2.i1;
            while (i0 + 1 < i1) {
                size_t imed = (i1 + i0) / 2;
                if (comp (pivot, src[imed])) {i1 = imed; }
                else                         {i0 = imed; }
            }
            s2s[t].i1 = s2s[t + 1].i0 = i1;
        }
    }
    // the output segment is the union of the two inputs
    s1.i0 = std::min(s1.i0, s2.i0);
    s1.i1 = std::max(s1.i1, s2.i1);
    sws[0].i0 = s1.i0;
    for (int t = 0; t < nt; t++) {
        sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len();
        if (t + 1 < nt)
            sws[t + 1].i0 = sws[t].i1;
    }
    assert(sws[nt - 1].i1 == s1.i1);

    // do the actual merging
#pragma omp parallel for num_threads(nt)
    for (int t = 0; t < nt; t++) {
        SegmentS sw = sws[t];
        SegmentS s1t = s1s[t];
        SegmentS s2t = s2s[t];
        if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) {
            for (;;) {
                if (comp(src[s1t.i0], src[s2t.i0])) {
                    dst[sw.i0++] = src[s1t.i0++];
                    if (s1t.i0 == s1t.i1) break;
                } else {
                    dst[sw.i0++] = src[s2t.i0++];
                    if (s2t.i0 == s2t.i1) break;
                }
            }
        }
        // copy the remaining tail of whichever segment is non-empty
        if (s1t.len() > 0) {
            assert(s1t.len() == sw.len());
            memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0]));
        } else if (s2t.len() > 0) {
            assert(s2t.len() == sw.len());
            memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0]));
        }
    }
}
void fvec_argsort (size_t n, const float *vals,
                   size_t *perm)
{
    for (size_t i = 0; i < n; i++) perm[i] = i;
    ArgsortComparator comp = {vals};
    std::sort (perm, perm + n, comp);
}
void fvec_argsort_parallel (size_t n, const float *vals,
                            size_t *perm)
{
    size_t * perm2 = new size_t[n];
    // 2 result tables; merge passes flip between them
    size_t *permB = perm2, *permA = perm;

    int nt = omp_get_max_threads();
    { // compute the parity of the number of merge passes, so that the
      // final result ends up in perm
        int nseg = nt;
        while (nseg > 1) {
            nseg = (nseg + 1) / 2;
            std::swap (permA, permB);
        }
    }

#pragma omp parallel for
    for (size_t i = 0; i < n; i++) permA[i] = i;

    ArgsortComparator comp = {vals};

    SegmentS segs[nt];

    // independent sorts
#pragma omp parallel for
    for (int t = 0; t < nt; t++) {
        size_t i0 = t * n / nt;
        size_t i1 = (t + 1) * n / nt;
        SegmentS seg = {i0, i1};
        std::sort (permA + seg.i0, permA + seg.i1, comp);
        segs[t] = seg;
    }
    int prev_nested = omp_get_nested();
    omp_set_nested(1);

    int nseg = nt;
    while (nseg > 1) {
        int nseg1 = (nseg + 1) / 2;
        int sub_nt = nseg % 2 == 0 ? nt : nt - 1;
        int sub_nseg1 = nseg / 2;

#pragma omp parallel for num_threads(nseg1)
        for (int s = 0; s < nseg; s += 2) {
            if (s + 1 == nseg) { // isolated segment: just copy it over
                memcpy (permB + segs[s].i0, permA + segs[s].i0,
                        segs[s].len() * sizeof(size_t));
            } else {
                int t0 = s * sub_nt / sub_nseg1;
                int t1 = (s + 1) * sub_nt / sub_nseg1;
                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                parallel_merge (permA, permB, segs[s], segs[s + 1],
                                t1 - t0, comp);
            }
        }
        for (int s = 0; s < nseg; s += 2)
            segs[s / 2] = segs[s];
        nseg = nseg1;
        std::swap (permA, permB);
    }
    assert (permA == perm);
    omp_set_nested(prev_nested);
    delete [] perm2;
}
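/* Usage sketch (illustrative, not library code): both argsort variants fill
 * perm with the ordering that sorts vals ascending; the parallel variant
 * pays off only for large n. */
static void example_fvec_argsort ()
{
    const size_t n = 1000000;
    std::vector<float> vals (n);
    float_rand (vals.data (), n, 1234);
    std::vector<size_t> perm (n);
    fvec_argsort_parallel (n, vals.data (), perm.data ());
    // vals[perm[0]] <= vals[perm[1]] <= ... <= vals[perm[n - 1]]
}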
static inline void fvec_madd_ref (size_t n, const float *a,
                                  float bf, const float *b, float *c) {
    for (size_t i = 0; i < n; i++)
        c[i] = a[i] + bf * b[i];
}
static inline void fvec_madd_sse (size_t n, const float *a,
                                  float bf, const float *b, float *c) {
    n >>= 2;  // 4 floats per iteration
    __m128 bf4 = _mm_set_ps1 (bf);
    __m128 * a4 = (__m128*)a;
    __m128 * b4 = (__m128*)b;
    __m128 * c4 = (__m128*)c;

    while (n--) {
        *c4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
        b4++;
        a4++;
        c4++;
    }
}
void fvec_madd (size_t n, const float *a,
                float bf, const float *b, float *c)
{
    // the SSE version requires n to be a multiple of 4 and all three
    // pointers to be 16-byte aligned
    if ((n & 3) == 0 &&
        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
        fvec_madd_sse (n, a, bf, b, c);
    else
        fvec_madd_ref (n, a, bf, b, c);
}
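/* Usage sketch (illustrative, not library code): c = a + bf * b. The
 * dispatcher above falls back to the scalar path when the SSE alignment
 * conditions do not hold, so callers need not worry about alignment. */
static void example_fvec_madd ()
{
    const size_t n = 8;
    std::vector<float> a (n, 1.0f), b (n, 2.0f), c (n);
    fvec_madd (n, a.data (), 0.5f, b.data (), c.data ());
    // every c[i] is now 1.0 + 0.5 * 2.0 == 2.0
}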
static inline int fvec_madd_and_argmin_ref (size_t n, const float *a,
                                            float bf, const float *b,
                                            float *c) {
    float vmin = 1e20;
    int imin = -1;

    for (size_t i = 0; i < n; i++) {
        c[i] = a[i] + bf * b[i];
        if (c[i] < vmin) {
            vmin = c[i];
            imin = i;
        }
    }
    return imin;
}
static inline int fvec_madd_and_argmin_sse (size_t n, const float *a,
                                            float bf, const float *b,
                                            float *c) {
    n >>= 2;
    __m128 bf4 = _mm_set_ps1 (bf);
    __m128 vmin4 = _mm_set_ps1 (1e20);
    __m128i imin4 = _mm_set1_epi32 (-1);
    __m128i idx4 = _mm_set_epi32 (3, 2, 1, 0);
    __m128i inc4 = _mm_set1_epi32 (4);
    __m128 * a4 = (__m128*)a;
    __m128 * b4 = (__m128*)b;
    __m128 * c4 = (__m128*)c;

    while (n--) {
        __m128 vc4 = _mm_add_ps (*a4, _mm_mul_ps (bf4, *b4));
        *c4 = vc4;
        // track the per-lane minimum and its index
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        vmin4 = _mm_min_ps (vmin4, vc4);
        b4++;
        a4++;
        c4++;
        idx4 = _mm_add_epi32 (idx4, inc4);
    }

    // horizontal reduction: 4 lanes -> 2
    {
        idx4 = _mm_shuffle_epi32 (imin4, 3 << 2 | 2);
        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 3 << 2 | 2);
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        vmin4 = _mm_min_ps (vmin4, vc4);
    }
    // 2 lanes -> 1
    {
        idx4 = _mm_shuffle_epi32 (imin4, 1);
        __m128 vc4 = _mm_shuffle_ps (vmin4, vmin4, 1);
        __m128i mask = (__m128i)_mm_cmpgt_ps (vmin4, vc4);
        imin4 = _mm_or_si128 (_mm_and_si128 (mask, idx4),
                              _mm_andnot_si128 (mask, imin4));
        // no need to update vmin4: only the index is returned
    }
    return _mm_extract_epi32 (imin4, 0);
}
int fvec_madd_and_argmin (size_t n, const float *a,
                          float bf, const float *b, float *c)
{
    if ((n & 3) == 0 &&
        ((((long)a) | ((long)b) | ((long)c)) & 15) == 0)
        return fvec_madd_and_argmin_sse (n, a, bf, b, c);
    else
        return fvec_madd_and_argmin_ref (n, a, bf, b, c);
}
const float *fvecs_maybe_subsample (
        size_t d, size_t *n, size_t nmax, const float *x,
        bool verbose, long seed)
{
    if (*n <= nmax) return x; // nothing to do

    size_t n2 = nmax;
    if (verbose) {
        printf ("  Input training set too big (max size is %ld), sampling "
                "%ld / %ld vectors\n", nmax, n2, *n);
    }
    std::vector<int> subset (*n);
    rand_perm (subset.data (), *n, seed);
    float *x_subset = new float[n2 * d];
    for (long i = 0; i < n2; i++)
        memcpy (&x_subset[i * d],
                &x[subset[i] * size_t(d)],
                sizeof (x[0]) * d);
    *n = n2;
    return x_subset;
}
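/* Usage sketch (illustrative, not library code): the returned pointer
 * aliases x when no subsampling was needed, so it must be freed only if it
 * differs from the input. The nmax value below is an arbitrary example. */
static void example_maybe_subsample (const float *x, size_t n, size_t d)
{
    size_t n2 = n;
    const float *x2 = fvecs_maybe_subsample (d, &n2, 65536, x, true, 1234);
    // ... use the n2 vectors in x2 for training ...
    if (x2 != x) delete [] x2;
}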