#include "IndexScalarQuantizer.h"

#include <cstdio>
#include <cstring>
#include <cmath>
#include <algorithm>
#include <memory>

#include <omp.h>

#include <immintrin.h>

#include "utils.h"
#include "FaissAssert.h"
#include "AuxIndexStructures.h"

namespace faiss {

using idx_t = Index::idx_t;
using QuantizerType = ScalarQuantizer::QuantizerType;
using RangeStat = ScalarQuantizer::RangeStat;
/*******************************************************************
 * Codec: converts between a value in [0, 1] and a bit field in a
 * code array. The "i" parameter is the index of the vector
 * component (not a byte index).
 */

struct Codec8bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code[i] = (int)(255 * x);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (code[i] + 0.5f) / 255.0f;
    }

    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint64_t c8 = *(uint64_t*)(code + i);
        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
        // assemble the 8 converted values into one AVX register
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
        return f8 * one_255;
    }
};
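/* The decoders return the center of the quantization bin rather than
   its lower bound: a component encoded as c decodes to (c + 0.5)/255,
   which halves the worst-case reconstruction error. Note that the
   arithmetic operators on __m256 values (f8 += half, f8 * one_255)
   rely on the GCC/Clang vector extensions. */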
struct Codec4bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;          // even components (low nibbles)
        uint32_t c4od = (c4 >> 4) & mask;   // odd components (high nibbles)

        // interleave: the 8 low bytes of c8 contain the 8 values
        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
                                        _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_15 = _mm256_set1_ps (1.f / 15.f);
        return f8 * one_15;
    }
};
struct Codec6bit {

    static void encode_component (float x, uint8_t *code, int i) {
        int bits = (int)(x * 63.0);
        code += (i >> 2) * 3;
        switch(i & 3) {
        case 0:
            code[0] |= bits;
            break;
        case 1:
            code[0] |= bits << 6;
            code[1] |= bits >> 2;
            break;
        case 2:
            code[1] |= bits << 4;
            code[2] |= bits >> 4;
            break;
        case 3:
            code[2] |= bits << 2;
            break;
        }
    }
    static float decode_component (const uint8_t *code, int i) {
        uint8_t bits;
        code += (i >> 2) * 3;
        switch(i & 3) {
        case 0:
            bits = code[0] & 0x3f;
            break;
        case 1:
            bits = code[0] >> 6;
            bits |= (code[1] & 0xf) << 2;
            break;
        case 2:
            bits = code[1] >> 4;
            bits |= (code[2] & 3) << 4;
            break;
        case 3:
            bits = code[2] >> 2;
            break;
        }
        return (bits + 0.5f) / 63.0f;
    }
    static __m256 decode_8_components (const uint8_t *code, int i) {
        // no fast SIMD path for the 6-bit layout: decode one
        // component at a time
        return _mm256_set_ps
            (decode_component(code, i + 7),
             decode_component(code, i + 6),
             decode_component(code, i + 5),
             decode_component(code, i + 4),
             decode_component(code, i + 3),
             decode_component(code, i + 2),
             decode_component(code, i + 1),
             decode_component(code, i + 0));
    }
};
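/* 6-bit packing: every group of 4 components occupies 3 bytes.
   Writing cK[5:0] for the 6 bits of component K within a group
   (least significant bits on the right):
       byte 0 = c1[1:0] c0[5:0]
       byte 1 = c2[3:0] c1[5:2]
       byte 2 = c3[5:0] c2[5:4]
*/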
/*******************************************************************
 * FP16 <-> FP32 conversions
 */

#ifdef USE_F16C
// hardware conversion via the F16C instructions

uint16_t encode_fp16 (float x) {
    __m128 xf = _mm_set1_ps (x);
    __m128i xi = _mm_cvtps_ph (
         xf, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    return _mm_cvtsi128_si32 (xi) & 0xffff;
}

float decode_fp16 (uint16_t x) {
    __m128i xi = _mm_set1_epi16 (x);
    __m128 xf = _mm_cvtph_ps (xi);
    return _mm_cvtss_f32 (xf);
}

#else
// software conversion, after Fabian "ryg" Giesen
// (https://gist.github.com/2156668)

float floatbits (uint32_t x) {
    void *xptr = &x;
    return *(float*)xptr;
}

uint32_t intbits (float f) {
    void *fptr = &f;
    return *(uint32_t*)fptr;
}

uint16_t encode_fp16 (float f) {
    uint32_t sign_mask = 0x80000000u;
    int32_t o;

    uint32_t fint = intbits(f);
    uint32_t sign = fint & sign_mask;
    fint ^= sign;

    // Inf or NaN (all exponent bits set): NaN -> qNaN, Inf -> Inf.
    // Unconditional assignment, overridden below in the regular case.
    uint32_t f32infty = 255u << 23;
    o = (fint > f32infty) ? 0x7e00u : 0x7c00u;

    // (de)normalized number or zero
    const uint32_t round_mask = ~0xfffu;
    const uint32_t magic = 15u << 23;

    // shift exponent down, denormalize if necessary
    float fscale = floatbits(fint & round_mask) * floatbits(magic);
    fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u));
    int32_t fint2 = intbits(fscale) - round_mask;

    if (fint < f32infty)
        o = fint2 >> 13; // take the bits!

    return (o | (sign >> 16));
}

float decode_fp16 (uint16_t h) {
    // https://gist.github.com/2144712, Fabian "ryg" Giesen
    const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift

    int32_t o = ((int32_t)(h & 0x7fffu)) << 13; // exponent/mantissa bits
    int32_t exp = shifted_exp & o;              // just the exponent
    o += (int32_t)(127 - 15) << 23;             // exponent adjust

    // handle exponent special cases
    int32_t infnan_val = o + ((int32_t)(128 - 16) << 23);
    int32_t zerodenorm_val = intbits(
                  floatbits(o + (1u << 23)) - floatbits(113u << 23));
    int32_t reg_val = (exp == 0) ? zerodenorm_val : o;

    int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16;
    return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
}

#endif
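/* Why the + ((127 - 15) << 23) works: an IEEE float32 stores its
   exponent with bias 127, a float16 with bias 15. Shifting the fp16
   exponent/mantissa field left by 13 aligns it with the fp32 layout,
   and adding (127 - 15) << 23 rewrites the exponent bias in place.
   The encoder runs the same trick in reverse: floatbits(15u << 23)
   is 2^-112, so the multiply shifts the exponent down by 127 - 15
   while the float pipeline takes care of rounding and denormals. */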
/*******************************************************************
 * Quantizer: encodes and decodes full vectors
 */

struct Quantizer {
    virtual void encode_vector(const float *x, uint8_t *code) const = 0;
    virtual void decode_vector(const uint8_t *code, float *x) const = 0;

    virtual ~Quantizer() {}
};
template<class Codec, bool uniform, int SIMD>
struct QuantizerTemplate {};
template<class Codec>
struct QuantizerTemplate<Codec, true, 1>: Quantizer {
    const size_t d;
    const float vmin, vdiff;

    QuantizerTemplate(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained[0]), vdiff(trained[1])
    {
    }

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin) / vdiff;
            if (xi < 0) {
                xi = 0;
            }
            if (xi > 1.0) {
                xi = 1.0;
            }
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin + xi * vdiff;
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin + xi * vdiff;
    }
};
template<class Codec>
struct QuantizerTemplate<Codec, true, 8>: QuantizerTemplate<Codec, true, 1> {

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        QuantizerTemplate<Codec, true, 1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
    }
};
template<class Codec>
struct QuantizerTemplate<Codec, false, 1>: Quantizer {
    const size_t d;
    const float *vmin, *vdiff;

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin[i]) / vdiff[i];
            if (xi < 0) {
                xi = 0;
            }
            if (xi > 1.0) {
                xi = 1.0;
            }
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin[i] + xi * vdiff[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin[i] + xi * vdiff[i];
    }
};
template<class Codec>
struct QuantizerTemplate<Codec, false, 8>: QuantizerTemplate<Codec, false, 1> {

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        QuantizerTemplate<Codec, false, 1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_loadu_ps (this->vmin + i) +
               xi * _mm256_loadu_ps (this->vdiff + i);
    }
};
/*******************************************************************
 * FP16 quantizer
 */

template<int SIMDWIDTH>
struct QuantizerFP16 {};

template<>
struct QuantizerFP16<1>: Quantizer {
    const size_t d;

    QuantizerFP16(size_t d, const std::vector<float> & /* unused */):
        d(d) {}

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            ((uint16_t*)code)[i] = encode_fp16(x[i]);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            x[i] = decode_fp16(((uint16_t*)code)[i]);
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        return decode_fp16(((uint16_t*)code)[i]);
    }
};
template<>
struct QuantizerFP16<8>: QuantizerFP16<1> {

    QuantizerFP16 (size_t d, const std::vector<float> &trained):
        QuantizerFP16<1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
        return _mm256_cvtph_ps (codei);
    }
};
/*******************************************************************
 * 8-bit direct quantizer: the codes are the (rounded) vector values
 */

template<int SIMDWIDTH>
struct Quantizer8bitDirect {};

template<>
struct Quantizer8bitDirect<1>: Quantizer {
    const size_t d;

    Quantizer8bitDirect(size_t d, const std::vector<float> & /* unused */):
        d(d) {}

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            code[i] = (uint8_t)x[i];
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            x[i] = code[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        return code[i];
    }
};
template<>
struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> {

    Quantizer8bitDirect (size_t d, const std::vector<float> &trained):
        Quantizer8bitDirect<1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
        __m256i y8 = _mm256_cvtepu8_epi32 (x8);             // 8 * int32
        return _mm256_cvtepi32_ps (y8);                     // 8 * float32
    }
};
template<int SIMDWIDTH>
Quantizer *select_quantizer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    switch(qtype) {
    case ScalarQuantizer::QT_8bit:
        return new QuantizerTemplate<Codec8bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_6bit:
        return new QuantizerTemplate<Codec6bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_4bit:
        return new QuantizerTemplate<Codec4bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_8bit_uniform:
        return new QuantizerTemplate<Codec8bit, true, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_4bit_uniform:
        return new QuantizerTemplate<Codec4bit, true, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_fp16:
        return new QuantizerFP16<SIMDWIDTH> (d, trained);
    case ScalarQuantizer::QT_8bit_direct:
        return new Quantizer8bitDirect<SIMDWIDTH> (d, trained);
    }
    FAISS_THROW_MSG ("unknown qtype");
}
Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
    if (sq.d % 8 == 0) {
        return select_quantizer<8> (sq.qtype, sq.d, sq.trained);
    } else {
        return select_quantizer<1> (sq.qtype, sq.d, sq.trained);
    }
}
/*******************************************************************
 * Quantizer range training
 */

static float sqr (float x) {
    return x * x;
}
void train_Uniform(RangeStat rs, float rs_arg,
                   idx_t n, int k, const float *x,
                   std::vector<float> & trained)
{
    trained.resize (2);
    float & vmin = trained[0];
    float & vmax = trained[1];

    if (rs == ScalarQuantizer::RS_minmax) {
        vmin = HUGE_VAL; vmax = -HUGE_VAL;
        for (size_t i = 0; i < n; i++) {
            if (x[i] < vmin) vmin = x[i];
            if (x[i] > vmax) vmax = x[i];
        }
        float vexp = (vmax - vmin) * rs_arg;
        vmin -= vexp;
        vmax += vexp;
    } else if (rs == ScalarQuantizer::RS_meanstd) {
        double sum = 0, sum2 = 0;
        for (size_t i = 0; i < n; i++) {
            sum += x[i];
            sum2 += x[i] * x[i];
        }
        float mean = sum / n;
        float var = sum2 / n - mean * mean;
        float std = var <= 0 ? 1.0 : sqrt(var);

        vmin = mean - std * rs_arg;
        vmax = mean + std * rs_arg;
    } else if (rs == ScalarQuantizer::RS_quantiles) {
        std::vector<float> x_copy(n);
        memcpy(x_copy.data(), x, n * sizeof(*x));
        // TODO a quickselect would be enough
        std::sort(x_copy.begin(), x_copy.end());
        int o = int(rs_arg * n);
        if (o < 0) o = 0;
        if (o > n - o) o = n / 2;
        vmin = x_copy[o];
        vmax = x_copy[n - 1 - o];
    } else if (rs == ScalarQuantizer::RS_optim) {
        float a, b;
        float sx = 0;
        {
            vmin = HUGE_VAL, vmax = -HUGE_VAL;
            for (size_t i = 0; i < n; i++) {
                if (x[i] < vmin) vmin = x[i];
                if (x[i] > vmax) vmax = x[i];
                sx += x[i];
            }
            b = vmin;
            a = (vmax - vmin) / (k - 1);
        }
        int verbose = false;
        int niter = 2000;
        float last_err = -1;
        int iter_last_err = 0;
        for (int it = 0; it < niter; it++) {
            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;

            for (idx_t i = 0; i < n; i++) {
                float xi = x[i];
                float ni = floor ((xi - b) / a + 0.5);
                if (ni < 0) ni = 0;
                if (ni >= k) ni = k - 1;
                err1 += sqr (xi - (ni * a + b));
                sn  += ni;
                sn2 += ni * ni;
                sxn += ni * xi;
            }

            if (err1 == last_err) {
                iter_last_err++;
                if (iter_last_err == 16) break;
            } else {
                last_err = err1;
                iter_last_err = 0;
            }

            float det = sqr (sn) - sn2 * n;

            b = (sn * sxn - sn2 * sx) / det;
            a = (sn * sx - n * sxn) / det;
            if (verbose) {
                printf ("it %d, err1=%g \r", it, err1);
                fflush (stdout);
            }
        }
        if (verbose) printf("\n");

        vmin = b;
        vmax = b + a * (k - 1);
    } else {
        FAISS_THROW_MSG ("Invalid range statistic");
    }
    // the trained pair is (vmin, vdiff)
    vmax -= vmin;
}
void train_NonUniform(RangeStat rs, float rs_arg,
                      idx_t n, int d, int k, const float *x,
                      std::vector<float> & trained)
{
    trained.resize (2 * d);
    float * vmin = trained.data();
    float * vmax = trained.data() + d;

    if (rs == ScalarQuantizer::RS_minmax) {
        memcpy (vmin, x, sizeof(*x) * d);
        memcpy (vmax, x, sizeof(*x) * d);
        for (size_t i = 1; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                if (xi[j] < vmin[j]) vmin[j] = xi[j];
                if (xi[j] > vmax[j]) vmax[j] = xi[j];
            }
        }
        float *vdiff = vmax;
        for (size_t j = 0; j < d; j++) {
            float vexp = (vmax[j] - vmin[j]) * rs_arg;
            vmin[j] -= vexp;
            vmax[j] += vexp;
            vdiff [j] = vmax[j] - vmin[j];
        }
    } else {
        // transpose, then train each dimension separately
        std::vector<float> xt(n * d);
        for (size_t i = 0; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                xt[j * n + i] = xi[j];
            }
        }
#pragma omp parallel for
        for (size_t j = 0; j < d; j++) {
            // per-thread buffer to avoid a race on the results
            std::vector<float> trained_d(2);
            train_Uniform(rs, rs_arg,
                          n, k, xt.data() + j * n,
                          trained_d);
            vmin[j] = trained_d[0];
            vmax[j] = trained_d[1]; // train_Uniform returns (vmin, vdiff)
        }
    }
}
/*******************************************************************
 * Similarity: accumulates the distance between a query and a
 * decoded vector, component by component
 */

template<int SIMDWIDTH>
struct SimilarityL2 {};

template<>
struct SimilarityL2<1> {
    static constexpr int simdwidth = 1;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/

    float accu;

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        float tmp = *yi++ - x;
        accu += tmp * tmp;
    }

    void add_component_2 (float x1, float x2) {
        float tmp = x1 - x2;
        accu += tmp * tmp;
    }

    float result () {
        return accu;
    }
};
template<>
struct SimilarityL2<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* AVX accumulator *******/

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    void add_8_components_2 (__m256 x, __m256 y) {
        __m256 tmp = x - y;
        accu8 += tmp * tmp;
    }

    float result_8 () {
        // horizontal sum of the 8 lanes
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th lane
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
template<int SIMDWIDTH>
struct SimilarityIP {};

template<>
struct SimilarityIP<1> {
    static constexpr int simdwidth = 1;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;

    const float *y, *yi;

    float accu;

    explicit SimilarityIP (const float * y):
        y (y) {}

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        accu += *yi++ * x;
    }

    void add_component_2 (float x1, float x2) {
        accu += x1 * x2;
    }

    float result () {
        return accu;
    }
};
template<>
struct SimilarityIP<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;

    const float *y, *yi;

    __m256 accu8;

    explicit SimilarityIP (const float * y):
        y (y) {}

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    void add_8_components_2 (__m256 x1, __m256 x2) {
        accu8 += x1 * x2;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th lane
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
/*******************************************************************
 * DistanceComputer: combines a quantizer and a similarity to do a
 * code-to-vector or code-to-code comparison. SQDistanceComputer
 * (declared in IndexScalarQuantizer.h) carries the current query q,
 * the code array codes, and the code_size stride.
 */

template<class Quantizer, class Similarity, int SIMDWIDTH>
struct DCTemplate : SQDistanceComputer {};
template<class Quantizer, class Similarity>
struct DCTemplate<Quantizer, Similarity, 1> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;

    DCTemplate(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance(const float* x, const uint8_t* code) const {
        Similarity sim(x);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float xi = quant.reconstruct_component(code, i);
            sim.add_component(xi);
        }
        return sim.result();
    }

    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float x1 = quant.reconstruct_component(code1, i);
            float x2 = quant.reconstruct_component(code2, i);
            sim.add_component_2(x1, x2);
        }
        return sim.result();
    }

    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }
};
template<class Quantizer, class Similarity>
struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;

    DCTemplate(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance(const float* x, const uint8_t* code) const {
        Similarity sim(x);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 xi = quant.reconstruct_8_components(code, i);
            sim.add_8_components(xi);
        }
        return sim.result_8();
    }

    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 x1 = quant.reconstruct_8_components(code1, i);
            __m256 x2 = quant.reconstruct_8_components(code2, i);
            sim.add_8_components_2(x1, x2);
        }
        return sim.result_8();
    }

    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }
};
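/* The SIMD specialization consumes 8 components per iteration, so it
   is only selected when d is a multiple of 8 (see the d % 8 dispatch
   in select_quantizer and get_distance_computer below). */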
/*******************************************************************
 * DistanceComputerByte: computes distances in the integer domain,
 * for QT_8bit_direct codes
 */

template<class Similarity, int SIMDWIDTH>
struct DistanceComputerByte : SQDistanceComputer {};
template<class Similarity>
struct DistanceComputerByte<Similarity, 1> : SQDistanceComputer {
    using Sim = Similarity;

    int d;
    std::vector<uint8_t> tmp;

    DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        int accu = 0;
        for (int i = 0; i < d; i++) {
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                accu += int(code1[i]) * code2[i];
            } else {
                int diff = int(code1[i]) - code2[i];
                accu += diff * diff;
            }
        }
        return accu;
    }

    void set_query (const float *x) final {
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }
};
template<class Similarity>
struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
    using Sim = Similarity;

    int d;
    std::vector<uint8_t> tmp;

    DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        __m256i accu = _mm256_setzero_si256 ();
        for (int i = 0; i < d; i += 16) {
            // load 16 bytes, convert to 16 uint16_t
            __m256i c1 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code1 + i)));
            __m256i c2 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code2 + i)));
            __m256i prod32;
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                prod32 = _mm256_madd_epi16(c1, c2);
            } else {
                __m256i diff = _mm256_sub_epi16(c1, c2);
                prod32 = _mm256_madd_epi16(diff, diff);
            }
            accu = _mm256_add_epi32 (accu, prod32);
        }
        // horizontal sum of the 8 int32 lanes
        __m128i sum = _mm256_extractf128_si256(accu, 0);
        sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
        sum = _mm_hadd_epi32 (sum, sum);
        sum = _mm_hadd_epi32 (sum, sum);
        return _mm_cvtsi128_si32 (sum);
    }

    void set_query (const float *x) final {
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }
};
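/* _mm256_madd_epi16 multiplies the 16-bit lanes pairwise and adds
   adjacent products into 32-bit lanes, so one instruction covers the
   multiply and half of the reduction. Overflow is not a concern: a
   pair contributes at most 2 * 255^2 < 2^17, leaving headroom for
   dimensions in the tens of thousands in the int32 accumulators. */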
template<class Sim>
SQDistanceComputer *select_distance_computer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    constexpr int SIMDWIDTH = Sim::simdwidth;
    switch(qtype) {
    case ScalarQuantizer::QT_8bit_uniform:
        return new DCTemplate<QuantizerTemplate<Codec8bit, true, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_4bit_uniform:
        return new DCTemplate<QuantizerTemplate<Codec4bit, true, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_8bit:
        return new DCTemplate<QuantizerTemplate<Codec8bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_6bit:
        return new DCTemplate<QuantizerTemplate<Codec6bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_4bit:
        return new DCTemplate<QuantizerTemplate<Codec4bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_fp16:
        return new DCTemplate
            <QuantizerFP16<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_8bit_direct:
        if (d % 16 == 0) {
            return new DistanceComputerByte<Sim, SIMDWIDTH>(d, trained);
        } else {
            return new DCTemplate
                <Quantizer8bitDirect<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
}
/*******************************************************************
 * ScalarQuantizer implementation
 */

ScalarQuantizer::ScalarQuantizer
          (size_t d, QuantizerType qtype):
          qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
{
    switch (qtype) {
    case QT_8bit:
    case QT_8bit_uniform:
    case QT_8bit_direct:
        code_size = d;
        break;
    case QT_4bit:
    case QT_4bit_uniform:
        code_size = (d + 1) / 2;
        break;
    case QT_6bit:
        code_size = (d * 6 + 7) / 8;
        break;
    case QT_fp16:
        code_size = d * 2;
        break;
    }
}
ScalarQuantizer::ScalarQuantizer ():
    qtype(QT_8bit),
    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
{}
void ScalarQuantizer::train (size_t n, const float *x)
{
    int bit_per_dim =
        qtype == QT_4bit_uniform ? 4 :
        qtype == QT_4bit ? 4 :
        qtype == QT_6bit ? 6 :
        qtype == QT_8bit_uniform ? 8 :
        qtype == QT_8bit ? 8 : -1;

    switch (qtype) {
    case QT_4bit_uniform:
    case QT_8bit_uniform:
        train_Uniform (rangestat, rangestat_arg,
                       n * d, 1 << bit_per_dim, x, trained);
        break;
    case QT_4bit:
    case QT_8bit:
    case QT_6bit:
        train_NonUniform (rangestat, rangestat_arg,
                          n, d, 1 << bit_per_dim, x, trained);
        break;
    case QT_fp16:
    case QT_8bit_direct:
        // no training necessary
        break;
    }
}
void ScalarQuantizer::compute_codes (const float * x,
                                     uint8_t * codes,
                                     size_t n) const
{
    std::unique_ptr<Quantizer> squant (select_quantizer (*this));
    // the codecs fill the codes with |=, so they must start at 0
    memset (codes, 0, code_size * n);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
}
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    std::unique_ptr<Quantizer> squant (select_quantizer (*this));
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
}
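/* Round-trip usage sketch (illustrative only; `xb` and `n` stand for
   the caller's vectors):

       ScalarQuantizer sq (d, ScalarQuantizer::QT_8bit_uniform);
       sq.train (n, xb);                         // fills sq.trained
       std::vector<uint8_t> codes (n * sq.code_size);
       sq.compute_codes (xb, codes.data(), n);   // encode
       std::vector<float> xrec (n * d);
       sq.decode (codes.data(), xrec.data(), n); // reconstruct bin centers
*/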
SQDistanceComputer *
ScalarQuantizer::get_distance_computer (MetricType metric) const
{
    if (d % 8 == 0) {
        if (metric == METRIC_L2) {
            return select_distance_computer<SimilarityL2<8> >
                (qtype, d, trained);
        } else {
            return select_distance_computer<SimilarityIP<8> >
                (qtype, d, trained);
        }
    } else {
        if (metric == METRIC_L2) {
            return select_distance_computer<SimilarityL2<1> >
                (qtype, d, trained);
        } else {
            return select_distance_computer<SimilarityIP<1> >
                (qtype, d, trained);
        }
    }
}
/*******************************************************************
 * InvertedListScanners for IndexIVFScalarQuantizer
 */

template<class DCClass>
struct IVFSQScannerIP: InvertedListScanner {
    DCClass dc;
    bool store_pairs, by_residual;

    size_t code_size;

    idx_t list_no;  /// current list
    float accu0;    /// added to all distances

    IVFSQScannerIP(int d, const std::vector<float> & trained,
                   size_t code_size, bool store_pairs,
                   bool by_residual):
        dc(d, trained), store_pairs(store_pairs),
        by_residual(by_residual),
        code_size(code_size), list_no(0), accu0(0)
    {}

    void set_query (const float *query) override {
        dc.set_query (query);
    }

    void set_list (idx_t list_no, float coarse_dis) override {
        this->list_no = list_no;
        accu0 = by_residual ? coarse_dis : 0;
    }

    float distance_to_code (const uint8_t *code) const final {
        return accu0 + dc.query_to_code (code);
    }

    size_t scan_codes (size_t list_size,
                       const uint8_t *codes,
                       const idx_t *ids,
                       float *simi, idx_t *idxi,
                       size_t k) const override
    {
        size_t nup = 0;
        for (size_t j = 0; j < list_size; j++) {

            float accu = accu0 + dc.query_to_code (codes);

            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
                // with store_pairs, the id encodes (list_no, offset)
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                minheap_push (k, simi, idxi, accu, id);
                nup++;
            }
            codes += code_size;
        }
        return nup;
    }

    void scan_codes_range (size_t list_size,
                           const uint8_t *codes,
                           const idx_t *ids,
                           float radius,
                           RangeQueryResult & res) const override
    {
        for (size_t j = 0; j < list_size; j++) {
            float accu = accu0 + dc.query_to_code (codes);
            if (accu > radius) {
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                res.add (accu, id);
            }
            codes += code_size;
        }
    }
};
template<class DCClass>
struct IVFSQScannerL2: InvertedListScanner {
    DCClass dc;

    bool store_pairs, by_residual;

    size_t code_size;

    const Index *quantizer;
    idx_t list_no;    /// current inverted list
    const float *x;   /// current query

    std::vector<float> tmp;

    IVFSQScannerL2(int d, const std::vector<float> & trained,
                   size_t code_size, const Index *quantizer,
                   bool store_pairs, bool by_residual):
        dc(d, trained), store_pairs(store_pairs), by_residual(by_residual),
        code_size(code_size), quantizer(quantizer),
        list_no (0), x (nullptr), tmp (d)
    {
    }

    void set_query (const float *query) override {
        x = query;
        if (!quantizer) {
            dc.set_query (query);
        }
    }

    void set_list (idx_t list_no, float /*coarse_dis*/) override {
        if (by_residual) {
            this->list_no = list_no;
            // shift the query by the centroid of the list
            quantizer->compute_residual (x, tmp.data(), list_no);
            dc.set_query (tmp.data ());
        } else {
            dc.set_query (x);
        }
    }

    float distance_to_code (const uint8_t *code) const final {
        return dc.query_to_code (code);
    }

    size_t scan_codes (size_t list_size,
                       const uint8_t *codes,
                       const idx_t *ids,
                       float *simi, idx_t *idxi,
                       size_t k) const override
    {
        size_t nup = 0;
        for (size_t j = 0; j < list_size; j++) {

            float dis = dc.query_to_code (codes);

            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                maxheap_push (k, simi, idxi, dis, id);
                nup++;
            }
            codes += code_size;
        }
        return nup;
    }

    void scan_codes_range (size_t list_size,
                           const uint8_t *codes,
                           const idx_t *ids,
                           float radius,
                           RangeQueryResult & res) const override
    {
        for (size_t j = 0; j < list_size; j++) {
            float dis = dc.query_to_code (codes);
            if (dis < radius) {
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                res.add (dis, id);
            }
            codes += code_size;
        }
    }
};
template<class DCClass>
InvertedListScanner* sel2_InvertedListScanner
      (const ScalarQuantizer *sq,
       const Index *quantizer, bool store_pairs, bool r)
{
    if (DCClass::Sim::metric_type == METRIC_L2) {
        return new IVFSQScannerL2<DCClass>(sq->d, sq->trained, sq->code_size,
                                           quantizer, store_pairs, r);
    } else {
        return new IVFSQScannerIP<DCClass>(sq->d, sq->trained, sq->code_size,
                                           store_pairs, r);
    }
}
template<class Similarity, class Codec, bool uniform>
InvertedListScanner* sel12_InvertedListScanner
      (const ScalarQuantizer *sq,
       const Index *quantizer, bool store_pairs, bool r)
{
    constexpr int SIMDWIDTH = Similarity::simdwidth;
    using QuantizerClass = QuantizerTemplate<Codec, uniform, SIMDWIDTH>;
    using DCClass = DCTemplate<QuantizerClass, Similarity, SIMDWIDTH>;
    return sel2_InvertedListScanner<DCClass> (sq, quantizer, store_pairs, r);
}
template<class Similarity>
InvertedListScanner* sel1_InvertedListScanner
        (const ScalarQuantizer *sq, const Index *quantizer,
         bool store_pairs, bool r)
{
    constexpr int SIMDWIDTH = Similarity::simdwidth;
    switch(sq->qtype) {
    case ScalarQuantizer::QT_8bit_uniform:
        return sel12_InvertedListScanner
            <Similarity, Codec8bit, true>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_4bit_uniform:
        return sel12_InvertedListScanner
            <Similarity, Codec4bit, true>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_8bit:
        return sel12_InvertedListScanner
            <Similarity, Codec8bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_4bit:
        return sel12_InvertedListScanner
            <Similarity, Codec4bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_6bit:
        return sel12_InvertedListScanner
            <Similarity, Codec6bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_fp16:
        return sel2_InvertedListScanner
            <DCTemplate<QuantizerFP16<SIMDWIDTH>, Similarity, SIMDWIDTH> >
            (sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_8bit_direct:
        if (sq->d % 16 == 0) {
            return sel2_InvertedListScanner
                <DistanceComputerByte<Similarity, SIMDWIDTH> >
                (sq, quantizer, store_pairs, r);
        } else {
            return sel2_InvertedListScanner
                <DCTemplate<Quantizer8bitDirect<SIMDWIDTH>,
                            Similarity, SIMDWIDTH> >
                (sq, quantizer, store_pairs, r);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
}
template<int SIMDWIDTH>
InvertedListScanner* sel0_InvertedListScanner
        (MetricType mt, const ScalarQuantizer *sq,
         const Index *quantizer, bool store_pairs, bool by_residual)
{
    if (mt == METRIC_L2) {
        return sel1_InvertedListScanner<SimilarityL2<SIMDWIDTH> >
            (sq, quantizer, store_pairs, by_residual);
    } else {
        return sel1_InvertedListScanner<SimilarityIP<SIMDWIDTH> >
            (sq, quantizer, store_pairs, by_residual);
    }
}
InvertedListScanner* select_InvertedListScanner
        (MetricType mt, const ScalarQuantizer *sq,
         const Index *quantizer, bool store_pairs,
         bool by_residual = false)
{
    if (sq->d % 8 == 0) {
        return sel0_InvertedListScanner<8>
            (mt, sq, quantizer, store_pairs, by_residual);
    } else {
        return sel0_InvertedListScanner<1>
            (mt, sq, quantizer, store_pairs, by_residual);
    }
}
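/* The scanner is selected in nested steps, each turning a runtime
   parameter into a template argument: select_InvertedListScanner
   picks the SIMD width (d % 8), sel0 the metric, sel1 the quantizer
   type, sel12 binds the codec into the DCTemplate, and sel2 picks
   the concrete IP or L2 scanner class. This is what lets the inner
   scan loops compile without any per-component branching. */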
/*******************************************************************
 * IndexScalarQuantizer implementation
 */

IndexScalarQuantizer::IndexScalarQuantizer
                      (int d, ScalarQuantizer::QuantizerType qtype,
                       MetricType metric):
          Index(d, metric),
          sq (d, qtype)
{
    is_trained =
        qtype == ScalarQuantizer::QT_fp16 ||
        qtype == ScalarQuantizer::QT_8bit_direct;
    code_size = sq.code_size;
}
IndexScalarQuantizer::IndexScalarQuantizer ():
      IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit)
{}

void IndexScalarQuantizer::train(idx_t n, const float* x)
{
    sq.train(n, x);
    is_trained = true;
}
void IndexScalarQuantizer::add(idx_t n, const float* x)
{
    FAISS_THROW_IF_NOT (is_trained);
    codes.resize ((n + ntotal) * code_size);
    sq.compute_codes (x, &codes[ntotal * code_size], n);
    ntotal += n;
}
void IndexScalarQuantizer::search (
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        idx_t* labels) const
{
    FAISS_THROW_IF_NOT (is_trained);

#pragma omp parallel
    {
        std::unique_ptr<InvertedListScanner> scanner (
            select_InvertedListScanner (metric_type, &sq, nullptr, true));

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            float * D = distances + k * i;
            idx_t * I = labels + k * i;
            // prepare the result heap
            if (metric_type == METRIC_L2) {
                maxheap_heapify (k, D, I);
            } else {
                minheap_heapify (k, D, I);
            }
            scanner->set_query (x + i * d);
            // with store_pairs and list_no == 0, the returned ids are
            // the vector indices themselves
            scanner->scan_codes (ntotal, codes.data(), nullptr, D, I, k);

            // sort the heap into final result order
            if (metric_type == METRIC_L2) {
                maxheap_reorder (k, D, I);
            } else {
                minheap_reorder (k, D, I);
            }
        }
    }
}
DistanceComputer *IndexScalarQuantizer::get_distance_computer () const
{
    SQDistanceComputer *dc = sq.get_distance_computer (metric_type);
    dc->code_size = sq.code_size;
    dc->codes = codes.data();
    return dc;
}
void IndexScalarQuantizer::reconstruct_n
        (idx_t i0, idx_t ni, float* recons) const
{
    std::unique_ptr<Quantizer> squant (select_quantizer (sq));
    for (size_t i = 0; i < ni; i++) {
        squant->decode_vector (&codes[(i + i0) * code_size], recons + i * d);
    }
}
/*******************************************************************
 * IndexIVFScalarQuantizer implementation
 */

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
          (Index *quantizer, size_t d, size_t nlist,
           ScalarQuantizer::QuantizerType qtype, MetricType metric):
          IndexIVF (quantizer, d, nlist, 0, metric),
          sq (d, qtype)
{
    code_size = sq.code_size;
    by_residual = true;
    // was not known at construction time
    invlists->code_size = code_size;
    is_trained = false;
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
      IndexIVF ()
{
    by_residual = true;
}
void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
    const float * x_in = x;

    // subsample the training set if it is too large
    x = fvecs_maybe_subsample (
         d, (size_t*)&n, 100000,
         x, verbose, 1234);

    std::unique_ptr<const float []> del_x (x_in == x ? nullptr : x);

    std::unique_ptr<long []> idx (new long [n]);
    quantizer->assign (n, x, idx.get());
    std::unique_ptr<float []> residuals (new float [n * d]);

#pragma omp parallel for
    for (idx_t i = 0; i < n; i++) {
        quantizer->compute_residual (x + i * d, residuals.get() + i * d,
                                     idx[i]);
    }

    sq.train (n, residuals.get());
}
void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
                                             const idx_t *list_nos,
                                             uint8_t * codes) const
{
    std::unique_ptr<Quantizer> squant (select_quantizer (sq));
    memset (codes, 0, code_size * n);

#pragma omp parallel
    {
        std::vector<float> residual (d);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long list_no = list_nos [i];
            if (list_no >= 0) {
                const float *xi = x + i * d;
                if (by_residual) {
                    quantizer->compute_residual (
                          xi, residual.data(), list_no);
                    xi = residual.data ();
                }
                squant->encode_vector (xi, codes + i * code_size);
            }
        }
    }
}
void IndexIVFScalarQuantizer::add_with_ids
       (idx_t n, const float * x, const long *xids)
{
    FAISS_THROW_IF_NOT (is_trained);
    std::unique_ptr<long []> idx (new long [n]);
    quantizer->assign (n, x, idx.get());

    std::unique_ptr<Quantizer> squant (select_quantizer (sq));
    size_t nadd = 0;

#pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
        std::vector<uint8_t> one_code (code_size);
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        // each thread takes care of a subset of lists
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                long id = xids ? xids[i] : ntotal + i;

                const float * xi = x + i * d;
                if (by_residual) {
                    quantizer->compute_residual (xi, residual.data(), list_no);
                    xi = residual.data();
                }

                // the codec fills the code with |=
                memset (one_code.data(), 0, code_size);
                squant->encode_vector (xi, one_code.data());

                invlists->add_entry (list_no, id, one_code.data());

                nadd++;
            }
        }
    }
    if (verbose) {
        printf("    added %ld / %ld vectors\n", (long) nadd, (long) n);
    }
    ntotal += n;
}
InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner
    (bool store_pairs) const
{
    return select_InvertedListScanner (metric_type, &sq, quantizer,
                                       store_pairs, by_residual);
}
void IndexIVFScalarQuantizer::reconstruct_from_offset (long list_no,
                                                       long offset,
                                                       float* recons) const
{
    std::vector<float> centroid(d);
    quantizer->reconstruct (list_no, centroid.data());

    const uint8_t* code = invlists->get_single_code (list_no, offset);
    sq.decode (code, recons, 1);
    for (int i = 0; i < d; ++i) {
        recons[i] += centroid[i];
    }
}

} // namespace faiss