#include "IndexScalarQuantizer.h"

#include <cstdio>
#include <cstring>
#include <cmath>
#include <cassert>
#include <algorithm>

#include <omp.h>

#include <immintrin.h>

#include "utils.h"
#include "FaissAssert.h"

namespace faiss {

typedef Index::idx_t idx_t;
typedef ScalarQuantizer::QuantizerType QuantizerType;
typedef ScalarQuantizer::RangeStat RangeStat;

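/*******************************************************************
 * Codec: converts between a float in [0, 1) and a compact integer code.
 * Codec8bit stores one byte per component, Codec4bit packs two components
 * per byte. The parameter i is always a component index, not a byte offset.
 */
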
struct Codec8bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code[i] = (int)(255 * x);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (code[i] + 0.5f) / 255.0f;
    }

    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint64_t c8 = *(uint64_t*)(code + i);
        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
        return f8 * one_255;
    }

};

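// Example round trip for Codec8bit (illustrative values): x = 0.5 encodes to
// (int)(255 * 0.5) = 127 and decodes to (127 + 0.5f) / 255 = 0.5; decoding to
// the center of the quantization bin halves the worst-case rounding error.
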
struct Codec4bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }

    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;
        uint32_t c4od = (c4 >> 4) & mask;

        // interleave even and odd nibbles: the 8 lower bytes of c8 then
        // hold one component each
        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
                                        _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_15 = _mm256_set1_ps (1.f / 15.f);
        return f8 * one_15;
    }

};

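// In Codec4bit, component i occupies nibble (i & 1) of byte i / 2: even
// components sit in the low nibble, odd components in the high nibble.
// For example, component 5 is the high nibble of byte 2.
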
struct SimilarityL2 {
    const float *y, *yi;   // query vector and read cursor
    float accu;            // scalar accumulator
    __m256 accu8;          // AVX accumulator

    explicit SimilarityL2 (const float * y): y(y) {}

    void begin () { accu = 0; yi = y; }
    void add_component (float x) {
        float tmp = *yi++ - x;
        accu += tmp * tmp;
    }
    float result () { return accu; }

    void begin_8 () { accu8 = _mm256_setzero_ps(); yi = y; }
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        return _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
               _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};

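// SimilarityL2::result_8 reduces the 8 partial sums: after two
// _mm256_hadd_ps calls, lane 0 of each 128-bit half holds the sum of its
// four accumulator lanes, so adding the two lanes gives the total.
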
struct SimilarityIP {
    const float *y, *yi;   // query vector and read cursor
    const float accu0;     // initial value of the accumulator
    float accu;            // scalar accumulator
    __m256 accu8;          // AVX accumulator

    SimilarityIP (const float * y, float accu0):
        y (y), accu0 (accu0) {}

    void begin () { accu = accu0; yi = y; }
    void add_component (float x) { accu += *yi++ * x; }
    float result () { return accu; }

    void begin_8 () { accu8 = _mm256_setzero_ps(); yi = y; }
    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }
    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        return accu0 +
               _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
               _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};

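// SimilarityIP::accu0 seeds the dot product with a precomputed term; the IVF
// search path below passes the query-to-centroid inner product here so that
// only the residual contribution has to be accumulated from the codes.
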
template<class Quantizer, class Similarity>
float compute_distance (const Quantizer & quant, Similarity & sim,
                        const uint8_t *code)
{
    sim.begin ();
    for (size_t i = 0; i < quant.d; i++) {
        float xi = quant.reconstruct_component (code, i);
        sim.add_component (xi);
    }
    return sim.result ();
}

template<class Quantizer, class Similarity>
float compute_distance_8 (const Quantizer & quant, Similarity & sim,
                          const uint8_t *code)
{
    sim.begin_8 ();
    for (size_t i = 0; i < quant.d; i += 8) {
        __m256 xi = quant.reconstruct_8_components (code, i);
        sim.add_8_components (xi);
    }
    return sim.result_8 ();
}

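// compute_distance_8 consumes 8 components per iteration and therefore
// assumes d is a multiple of 8; select_quantizer below only picks the AVX
// ("8") quantizer variants in that case.
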
static float sqr (float x) {
    return x * x;
}

void train_Uniform(RangeStat rs, float rs_arg,
                   idx_t n, int k, const float *x,
                   std::vector<float> & trained)
{
    trained.resize (2);
    float & vmin = trained[0];
    float & vmax = trained[1];

    if (rs == ScalarQuantizer::RS_minmax) {
        vmin = HUGE_VAL; vmax = -HUGE_VAL;
        for (size_t i = 0; i < n; i++) {
            if (x[i] < vmin) vmin = x[i];
            if (x[i] > vmax) vmax = x[i];
        }
        float vexp = (vmax - vmin) * rs_arg;
        vmin -= vexp;
        vmax += vexp;
    } else if (rs == ScalarQuantizer::RS_meanstd) {
        double sum = 0, sum2 = 0;
        for (size_t i = 0; i < n; i++) {
            sum += x[i];
            sum2 += x[i] * x[i];
        }
        float mean = sum / n;
        float var = sum2 / n - mean * mean;
        float std = var <= 0 ? 1.0 : sqrt(var);

        vmin = mean - std * rs_arg;
        vmax = mean + std * rs_arg;
    } else if (rs == ScalarQuantizer::RS_quantiles) {
        std::vector<float> x_copy(n);
        memcpy(x_copy.data(), x, n * sizeof(*x));
        std::sort(x_copy.begin(), x_copy.end());
        int o = int(rs_arg * n);
        if (o < 0) o = 0;
        if (o > n - o) o = n / 2;
        vmin = x_copy[o];
        vmax = x_copy[n - 1 - o];
    } else if (rs == ScalarQuantizer::RS_optim) {
        // alternate optimization of the reconstruction error: assign each
        // value to the nearest of the k levels b, b + a, ..., b + (k-1) a,
        // then re-fit a and b by least squares
        float a, b;
        float sx = 0;
        {
            vmin = HUGE_VAL, vmax = -HUGE_VAL;
            for (size_t i = 0; i < n; i++) {
                if (x[i] < vmin) vmin = x[i];
                if (x[i] > vmax) vmax = x[i];
                sx += x[i];
            }
            b = vmin;
            a = (vmax - vmin) / (k - 1);
        }
        bool verbose = false;
        int niter = 2000;
        float last_err = -1;
        int iter_last_err = 0;
        for (int it = 0; it < niter; it++) {
            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;

            for (idx_t i = 0; i < n; i++) {
                float xi = x[i];
                float ni = floor ((xi - b) / a + 0.5);
                if (ni < 0) ni = 0;
                if (ni >= k) ni = k - 1;
                err1 += sqr (xi - (ni * a + b));
                sn  += ni;
                sn2 += ni * ni;
                sxn += ni * xi;
            }

            if (err1 == last_err) {
                iter_last_err ++;
                if (iter_last_err == 16) break;
            } else {
                last_err = err1;
                iter_last_err = 0;
            }

            // least-squares update of the offset b and step a
            float det = sqr (sn) - sn2 * n;

            b = (sn * sxn - sn2 * sx) / det;
            a = (sn * sx - n * sxn) / det;
            if (verbose)
                printf ("it %d, err1=%g \r", it, err1);
        }
        if (verbose) printf("\n");

        vmin = b;
        vmax = b + a * (k - 1);
    } else {
        FAISS_THROW_MSG ("Invalid qtype");
    }
    vmax -= vmin;   // trained[1] stores the range vdiff rather than vmax
}

void train_NonUniform(RangeStat rs, float rs_arg,
                      idx_t n, int d, int k, const float *x,
                      std::vector<float> & trained)
{
    trained.resize (2 * d);
    float * vmin = trained.data();
    float * vmax = trained.data() + d;
    if (rs == ScalarQuantizer::RS_minmax) {
        memcpy (vmin, x, sizeof(*x) * d);
        memcpy (vmax, x, sizeof(*x) * d);
        for (size_t i = 1; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                if (xi[j] < vmin[j]) vmin[j] = xi[j];
                if (xi[j] > vmax[j]) vmax[j] = xi[j];
            }
        }
        float *vdiff = vmax;   // second half of trained stores the range
        for (size_t j = 0; j < d; j++) {
            float vexp = (vmax[j] - vmin[j]) * rs_arg;
            vmin[j] -= vexp;
            vmax[j] += vexp;
            vdiff [j] = vmax[j] - vmin[j];
        }
    } else {
        // transpose the data so that each dimension is contiguous,
        // then train each dimension separately
        std::vector<float> xt(n * d);
        for (size_t i = 0; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                xt[j * n + i] = xi[j];
            }
        }
#pragma omp parallel for
        for (size_t j = 0; j < d; j++) {
            std::vector<float> trained_d(2);
            train_Uniform(rs, rs_arg,
                          n, k, xt.data() + j * n,
                          trained_d);
            vmin[j] = trained_d[0];
            vmax[j] = trained_d[1];
        }
    }
}

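// For the non-uniform case, trained holds d values of vmin followed by
// d values of vdiff, matching the vmin / vdiff pointers used by
// QuantizerNonUniform below.
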
struct Quantizer {

    virtual void encode_vector (const float *x, uint8_t *code) const = 0;
    virtual void decode_vector (const uint8_t *code, float *x) const = 0;

    virtual float compute_distance_L2 (SimilarityL2 &sim,
                                       const uint8_t * codes) const = 0;
    virtual float compute_distance_IP (SimilarityIP &sim,
                                       const uint8_t * codes) const = 0;

    virtual ~Quantizer() {}
};

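/*******************************************************************
 * Quantizer implementations: {uniform, per-dimension} ranges combined with
 * the 8-bit and 4-bit codecs. The "8" variants compute distances 8
 * components at a time with AVX instead of the scalar loop.
 */
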
template<class Codec>
struct QuantizerUniform: Quantizer {
    const size_t d;
    const float vmin, vdiff;

    QuantizerUniform(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained[0]), vdiff(trained[1]) {
    }

    void encode_vector(const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin) / vdiff;
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin + xi * vdiff;
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin + xi * vdiff;
    }

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_set1_ps(vmin) + xi * _mm256_set1_ps (vdiff);
    }

    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
        const override {
        return compute_distance(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
        const override {
        return compute_distance(*this, sim, codes);
    }
};

template<class Codec>
struct QuantizerUniform8: QuantizerUniform<Codec> {

    QuantizerUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerUniform<Codec> (d, trained) {}

    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
        const override {
        return compute_distance_8(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
        const override {
        return compute_distance_8(*this, sim, codes);
    }
};

template<class Codec>
struct QuantizerNonUniform: Quantizer {
    const size_t d;
    const float *vmin, *vdiff;

    QuantizerNonUniform(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}

    void encode_vector(const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin[i]) / vdiff[i];
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin[i] + xi * vdiff[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin[i] + xi * vdiff[i];
    }

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_loadu_ps(vmin + i) + xi * _mm256_loadu_ps (vdiff + i);
    }

    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
        const override {
        return compute_distance(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
        const override {
        return compute_distance(*this, sim, codes);
    }
};

template<class Codec>
struct QuantizerNonUniform8: QuantizerNonUniform<Codec> {

    QuantizerNonUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerNonUniform<Codec> (d, trained) {}

    float compute_distance_L2(SimilarityL2& sim, const uint8_t* codes)
        const override {
        return compute_distance_8(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim, const uint8_t* codes)
        const override {
        return compute_distance_8(*this, sim, codes);
    }
};

Quantizer *select_quantizer (QuantizerType qtype,
                             size_t d, const std::vector<float> & trained)
{
    if (d % 8 == 0) {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform8<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform8<Codec4bit>(d, trained);
        }
    } else {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform<Codec4bit>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}

Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
    return select_quantizer (sq.qtype, sq.d, sq.trained);
}

ScalarQuantizer::ScalarQuantizer
          (size_t d, QuantizerType qtype):
          qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
{
    switch (qtype) {
    case QT_8bit: case QT_8bit_uniform:
        code_size = d;
        break;
    case QT_4bit: case QT_4bit_uniform:
        code_size = (d + 1) / 2;
        break;
    }
}

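// Example: for d = 20, QT_8bit gives code_size = 20 bytes per vector while
// QT_4bit gives (20 + 1) / 2 = 10 bytes, since two components share a byte.
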
ScalarQuantizer::ScalarQuantizer ():
    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
{}

void ScalarQuantizer::train (size_t n, const float *x)
{
    int bit_per_dim =
        qtype == QT_4bit_uniform ? 4 :
        qtype == QT_4bit ? 4 : 8;

    if (qtype == QT_4bit_uniform || qtype == QT_8bit_uniform) {
        // uniform: one [vmin, vdiff] range shared by all dimensions
        train_Uniform (rangestat, rangestat_arg,
                       n * d, 1 << bit_per_dim, x, trained);
    } else {
        // non-uniform: one range per dimension
        train_NonUniform (rangestat, rangestat_arg,
                          n, d, 1 << bit_per_dim, x, trained);
    }
}

void ScalarQuantizer::compute_codes (const float * x,
                                     uint8_t * codes,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
    memset (codes, 0, code_size * n);   // the 4-bit codec ORs nibbles into bytes
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
    delete squant;
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
    delete squant;
}

IndexScalarQuantizer::IndexScalarQuantizer
          (int d, ScalarQuantizer::QuantizerType qtype, MetricType metric):
          Index (d, metric), sq (d, qtype)
{
    is_trained = false;
    code_size = sq.code_size;
}

IndexScalarQuantizer::IndexScalarQuantizer ():
    IndexScalarQuantizer (0, ScalarQuantizer::QT_8bit)
{}

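// Brute-force search over the flat code array: for inner product a min-heap
// on the scores keeps the k best candidates (the smallest retained score is
// at simi[0]), for L2 a max-heap keeps the k smallest distances (the largest
// retained distance is at simi[0]).
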
void IndexScalarQuantizer::search (idx_t n, const float *x, idx_t k,
                                   float *distances, idx_t *labels) const
{
    Quantizer *squant = select_quantizer (sq);

    if (metric_type == METRIC_INNER_PRODUCT) {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            idx_t *idxi = labels + i * k;
            float *simi = distances + i * k;
            minheap_heapify (k, simi, idxi);

            SimilarityIP sim (x + i * d, 0);
            const uint8_t *ci = codes.data ();

            for (size_t j = 0; j < ntotal; j++) {
                float accu = squant->compute_distance_IP (sim, ci);

                if (accu > simi [0]) {
                    minheap_pop (k, simi, idxi);
                    minheap_push (k, simi, idxi, accu, j);
                }
                ci += code_size;
            }
            minheap_reorder (k, simi, idxi);
        }
    } else {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            idx_t *idxi = labels + i * k;
            float *simi = distances + i * k;
            maxheap_heapify (k, simi, idxi);

            SimilarityL2 sim (x + i * d);
            const uint8_t *ci = codes.data ();

            for (size_t j = 0; j < ntotal; j++) {
                float accu = squant->compute_distance_L2 (sim, ci);

                if (accu < simi [0]) {
                    maxheap_pop (k, simi, idxi);
                    maxheap_push (k, simi, idxi, accu, j);
                }
                ci += code_size;
            }
            maxheap_reorder (k, simi, idxi);
        }
    }
    delete squant;
}

void IndexScalarQuantizer::reconstruct_n
        (idx_t i0, idx_t ni, float* recons) const
{
    Quantizer *squant = select_quantizer (sq);
    for (size_t i = 0; i < ni; i++) {
        squant->decode_vector (&codes[(i + i0) * code_size], recons + i * d);
    }
    delete squant;
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
          (Index *quantizer, size_t d, size_t nlist,
           QuantizerType qtype, MetricType metric):
          IndexIVF (quantizer, d, nlist, metric),
          sq (d, qtype)
{
    code_size = sq.code_size;
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
    code_size (0)
{}

void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
    long * idx = new long [n];
    quantizer->assign (n, x, idx);
    float *residuals = new float [n * d];
#pragma omp parallel for
    for (idx_t i = 0; i < n; i++) {
        quantizer->compute_residual (x + i * d, residuals + i * d, idx[i]);
    }
    sq.train (n, residuals);
    delete [] residuals;
    delete [] idx;
}

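// add_with_ids parallelizes over inverted lists: each thread only appends to
// the lists with list_no % nt == rank, so no two threads touch the same list
// and no locking is needed.
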
void IndexIVFScalarQuantizer::add_with_ids
       (idx_t n, const float * x, const long *xids)
{
    FAISS_THROW_IF_NOT (is_trained);
    long * idx = new long [n];
    quantizer->assign (n, x, idx);
    size_t nadd = 0;
    Quantizer *squant = select_quantizer (sq);

#pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        // each thread handles the lists with list_no % nt == rank
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                long id = xids ? xids[i] : ntotal + i;

                assert (list_no < nlist);

                ids[list_no].push_back (id);
                nadd++;
                quantizer->compute_residual (
                      x + i * d, residual.data(), list_no);

                size_t cur_size = codes[list_no].size();
                codes[list_no].resize (cur_size + code_size);

                squant->encode_vector (residual.data(),
                                       codes[list_no].data() + cur_size);
            }
        }
    }

    ntotal += nadd;
    delete squant;
    delete [] idx;
}

void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
                            const idx_t *cent_ids, const float *cent_dis,
                            const Quantizer & quant,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> decoded(d);
    minheap_heapify (k, simi, idxi);
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;
        float accu0 = cent_dis[i];

        const std::vector<idx_t> & ids = index.ids[list_no];
        const uint8_t* codes = index.codes[list_no].data();

        SimilarityIP sim(x, accu0);

        for (size_t j = 0; j < ids.size(); j++) {
            float accu = quant.compute_distance_IP(sim, codes);

            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                minheap_push (k, simi, idxi, accu, id);
            }
            codes += code_size;
        }
    }
    minheap_reorder (k, simi, idxi);
}

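// Both probe functions report labels as list_no << 32 | j when store_pairs is
// set, i.e. the (inverted list, offset) pair instead of the stored id, which
// lets the caller locate the corresponding code directly.
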
void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
                            const float *x_in,
                            const idx_t *cent_ids,
                            const Index *quantizer,
                            const Quantizer & quant,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> decoded(d), x(d);
    maxheap_heapify (k, simi, idxi);
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;

        const std::vector<idx_t> & ids = index.ids[list_no];
        const uint8_t* codes = index.codes[list_no].data();

        // express the query as a residual wrt. the list centroid
        quantizer->compute_residual (x_in, x.data(), list_no);

        SimilarityL2 sim(x.data());

        for (size_t j = 0; j < ids.size(); j++) {
            float dis = quant.compute_distance_L2 (sim, codes);

            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                maxheap_push (k, simi, idxi, dis, id);
            }
            codes += code_size;
        }
    }
    maxheap_reorder (k, simi, idxi);
}

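// The inverted-list codes store residuals wrt. the list centroid: the L2
// path converts the query to a residual for each probed list before scanning,
// while the inner-product path folds the centroid term in through accu0.
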
void IndexIVFScalarQuantizer::search_preassigned (
                       idx_t n, const float *x, idx_t k,
                       const idx_t *idx, const float *dis,
                       float *distances, idx_t *labels,
                       bool store_pairs) const
{
    Quantizer *squant = select_quantizer (sq);

    if (metric_type == METRIC_INNER_PRODUCT) {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            search_with_probes_ip (*this, x + i * d,
                                   idx + i * nprobe, dis + i * nprobe, *squant,
                                   k, distances + i * k, labels + i * k,
                                   store_pairs);
        }
    } else {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            search_with_probes_L2 (*this, x + i * d,
                                   idx + i * nprobe, quantizer, *squant,
                                   k, distances + i * k, labels + i * k,
                                   store_pairs);
        }
    }

    delete squant;
}

} // namespace faiss