#include "IndexScalarQuantizer.h"

#include <cstdio>
#include <cstring>
#include <cmath>
#include <algorithm>

#include <omp.h>

#include <immintrin.h>

#include "utils.h"
#include "FaissAssert.h"

namespace faiss {

/*******************************************************************
 * ScalarQuantizer implementation
 *
 * The main source of complexity is to support combinations of
 * variants without incurring runtime tests or virtual function
 * calls in the inner loops:
 *
 * - 4 / 8 bits per code component
 * - uniform / non-uniform range
 * - IP / L2 distance search
 * - scalar / AVX distance computation
 *
 * The appropriate Quantizer object is returned via select_quantizer,
 * which hides the template mess.
 ********************************************************************/

typedef Index::idx_t idx_t;
typedef ScalarQuantizer::QuantizerType QuantizerType;
typedef ScalarQuantizer::RangeStat RangeStat;
using DistanceComputer = ScalarQuantizer::DistanceComputer;
/*******************************************************************
 * Codec: converts between values in [0, 1] and an index in a code
 * array. The "i" parameter is the component index, not the byte
 * index.
 */

struct Codec8bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code[i] = (int)(255 * x);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (code[i] + 0.5f) / 255.0f;
    }
    static __m256 decode_8_components (const uint8_t *code, int i) {
        // load 8 bytes and zero-extend to 2 x 4 32-bit ints
        uint64_t c8 = *(uint64_t*)(code + i);
        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
        return f8 * one_255;
    }

};
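/* Note on the +0.5 in both decoders: a code value c represents the
 * interval [c/255, (c+1)/255); decoding to the bin center
 * (c + 0.5) / 255 rather than the lower edge minimizes the expected
 * reconstruction error for values spread within the bin. For
 * example, bytes {0, 128} decode to {0.5/255, 128.5/255}, not
 * {0, 128/255}. */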
struct Codec4bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;          // even components (low nibbles)
        uint32_t c4od = (c4 >> 4) & mask;   // odd components (high nibbles)

        // the 8 low bytes of c8 contain the 8 values in component order
        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
                                        _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_15 = _mm256_set1_ps (1.f / 15.f);
        return f8 * one_15;
    }

};
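/* The nibble shuffle above is the subtle part: the 4-bit codec stores
 * component 2j in the low nibble and component 2j+1 in the high
 * nibble of byte j. Masking with 0x0f0f0f0f extracts the even
 * components, shifting right by 4 first extracts the odd ones, and
 * _mm_unpacklo_epi8 interleaves the two byte streams so the 8 values
 * come back in their original component order before widening. */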
/*******************************************************************
 * Quantizer: normalizes scalar vector components into [0, 1], then
 * passes them through a Codec
 */

struct Quantizer {

    virtual void encode_vector (const float *x, uint8_t *code)
        const = 0;

    virtual void decode_vector (const uint8_t *code, float *x)
        const = 0;

    virtual ~Quantizer() {}
};
template<class Codec>
struct QuantizerUniform: Quantizer {
    const size_t d;
    const float vmin, vdiff;

    QuantizerUniform (size_t d, const std::vector<float> &trained):
        d(d), vmin(trained[0]), vdiff(trained[1])
    {}

    void encode_vector (const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin) / vdiff;
            // clamp to the representable range
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector (const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin + xi * vdiff;
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin + xi * vdiff;
    }

};
template<class Codec>
struct QuantizerUniform8: QuantizerUniform<Codec> {

    QuantizerUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerUniform<Codec> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
    }

};
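/* Note: expressions like a + xi * b on __m256 values rely on the
 * GCC/Clang vector-extension operators; with strict intrinsics they
 * would read _mm256_add_ps(a, _mm256_mul_ps(xi, b)). The same applies
 * to the += accumulators in the Similarity structs below. */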
template<class Codec>
struct QuantizerNonUniform: Quantizer {
    const size_t d;
    const float *vmin, *vdiff;

    QuantizerNonUniform (size_t d, const std::vector<float> &trained):
        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}

    void encode_vector (const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin[i]) / vdiff[i];
            // clamp to the representable range
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector (const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin[i] + xi * vdiff[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin[i] + xi * vdiff[i];
    }

};
template<class Codec>
struct QuantizerNonUniform8: QuantizerNonUniform<Codec> {

    QuantizerNonUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerNonUniform<Codec> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_loadu_ps (this->vmin + i) +
            xi * _mm256_loadu_ps (this->vdiff + i);
    }

};
Quantizer *select_quantizer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    if (d % 8 == 0) {
        // AVX versions, 8 components per step
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform8<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform8<Codec4bit>(d, trained);
        }
    } else {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform<Codec4bit>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}
Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
    return select_quantizer (sq.qtype, sq.d, sq.trained);
}
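/* A minimal round-trip sketch (hypothetical usage; assumes `trained`
 * already holds {vmin, vdiff} for a uniform 8-bit quantizer):
 *
 *   std::vector<float> trained = {0.0f, 1.0f};   // vmin, vdiff
 *   Quantizer *q = select_quantizer (
 *           ScalarQuantizer::QT_8bit_uniform, d, trained);
 *   std::vector<uint8_t> code (d);               // 1 byte per component
 *   q->encode_vector (x, code.data());
 *   q->decode_vector (code.data(), x_reconstructed);
 *   delete q;
 *
 * The 4-bit codecs OR bits into the code array, so the code buffer
 * must be zero-initialized before encode_vector. */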
/*******************************************************************
 * Quantizer range training
 */

static float sqr (float x) {
    return x * x;
}
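/* train_Uniform estimates the quantization range from training data
 * and stores it as (vmin, vdiff = vmax - vmin):
 *
 * - RS_minmax:    [min - rs_arg*(max-min), max + rs_arg*(max-min)]
 * - RS_meanstd:   [mean - std * rs_arg, mean + std * rs_arg]
 * - RS_quantiles: clips the rs_arg quantile on each side
 * - RS_optim:     alternate optimization of the reconstruction error
 *
 * For RS_optim, the grid x ~ ni * a + b is refined by alternating
 * (1) nearest-level assignment of each sample to a level ni and
 * (2) a least-squares re-fit of (a, b). With sn = sum ni,
 * sn2 = sum ni^2, sx = sum xi, sxn = sum ni*xi, the normal equations
 * of min_{a,b} sum_i (xi - (ni*a + b))^2 give, for
 * det = sn^2 - sn2*n:
 *
 *     b = (sn*sxn - sn2*sx) / det
 *     a = (sn*sx  - n*sxn ) / det
 */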
void train_Uniform(RangeStat rs, float rs_arg,
                   idx_t n, int k, const float *x,
                   std::vector<float> & trained)
{
    trained.resize (2);
    float & vmin = trained[0];
    float & vmax = trained[1];

    if (rs == ScalarQuantizer::RS_minmax) {
        vmin = HUGE_VAL; vmax = -HUGE_VAL;
        for (size_t i = 0; i < n; i++) {
            if (x[i] < vmin) vmin = x[i];
            if (x[i] > vmax) vmax = x[i];
        }
        float vexp = (vmax - vmin) * rs_arg;
        vmin -= vexp;
        vmax += vexp;
    } else if (rs == ScalarQuantizer::RS_meanstd) {
        double sum = 0, sum2 = 0;
        for (size_t i = 0; i < n; i++) {
            sum += x[i];
            sum2 += x[i] * x[i];
        }
        float mean = sum / n;
        float var = sum2 / n - mean * mean;
        float std = var <= 0 ? 1.0 : sqrt(var);

        vmin = mean - std * rs_arg;
        vmax = mean + std * rs_arg;
    } else if (rs == ScalarQuantizer::RS_quantiles) {
        std::vector<float> x_copy(n);
        memcpy(x_copy.data(), x, n * sizeof(*x));
        std::sort(x_copy.begin(), x_copy.end());
        int o = int(rs_arg * n);
        if (o < 0) o = 0;
        if (o > n - o) o = n / 2;
        vmin = x_copy[o];
        vmax = x_copy[n - 1 - o];
    } else if (rs == ScalarQuantizer::RS_optim) {
        float a, b;
        float sx = 0;
        {
            vmin = HUGE_VAL; vmax = -HUGE_VAL;
            for (size_t i = 0; i < n; i++) {
                if (x[i] < vmin) vmin = x[i];
                if (x[i] > vmax) vmax = x[i];
                sx += x[i];
            }
            b = vmin;
            a = (vmax - vmin) / (k - 1);
        }
        int verbose = false;
        int niter = 2000;
        float last_err = -1;
        int iter_last_err = 0;
        for (int it = 0; it < niter; it++) {
            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;

            for (idx_t i = 0; i < n; i++) {
                float xi = x[i];
                float ni = floor ((xi - b) / a + 0.5);
                if (ni < 0) ni = 0;
                if (ni >= k) ni = k - 1;
                err1 += sqr (xi - (ni * a + b));
                sn  += ni;
                sn2 += ni * ni;
                sxn += ni * xi;
            }

            if (err1 == last_err) {
                iter_last_err++;
                if (iter_last_err == 16) break;
            } else {
                last_err = err1;
                iter_last_err = 0;
            }

            // least-squares re-fit of (a, b) given the assignments ni
            float det = sqr (sn) - sn2 * n;

            b = (sn * sxn - sn2 * sx) / det;
            a = (sn * sx - n * sxn) / det;
            if (verbose) {
                printf ("it %d, err1=%g    \r", it, err1);
                fflush (stdout);
            }
        }
        if (verbose) printf("\n");

        vmin = b;
        vmax = b + a * (k - 1);
    } else {
        FAISS_THROW_MSG ("unknown RangeStat");
    }
    // store the extent rather than the maximum: trained = (vmin, vdiff)
    vmax -= vmin;
}
void train_NonUniform(RangeStat rs, float rs_arg,
                      idx_t n, int d, int k, const float *x,
                      std::vector<float> & trained)
{
    trained.resize (2 * d);
    float * vmin = trained.data();
    float * vmax = trained.data() + d;
    if (rs == ScalarQuantizer::RS_minmax) {
        memcpy (vmin, x, sizeof(*x) * d);
        memcpy (vmax, x, sizeof(*x) * d);
        for (size_t i = 1; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                if (xi[j] < vmin[j]) vmin[j] = xi[j];
                if (xi[j] > vmax[j]) vmax[j] = xi[j];
            }
        }
        float *vdiff = vmax;   // stored in place of vmax
        for (size_t j = 0; j < d; j++) {
            float vexp = (vmax[j] - vmin[j]) * rs_arg;
            vmin[j] -= vexp;
            vmax[j] += vexp;
            vdiff [j] = vmax[j] - vmin[j];
        }
    } else {
        // transpose, then train each dimension separately
        std::vector<float> xt(n * d);
        for (size_t i = 0; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                xt[j * n + i] = xi[j];
            }
        }
#pragma omp parallel for
        for (size_t j = 0; j < d; j++) {
            // per-thread output buffer to avoid a data race
            std::vector<float> trained_d(2);
            train_Uniform(rs, rs_arg,
                          n, k, xt.data() + j * n,
                          trained_d);
            vmin[j] = trained_d[0];
            vmax[j] = trained_d[1];  // train_Uniform stores vdiff here
        }
    }
}
/*******************************************************************
 * Similarity: accumulates the distance between a query vector and
 * decoded code components
 */

struct SimilarityL2 {
    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/

    float accu;

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        float tmp = *yi++ - x;
        accu += tmp * tmp;
    }

    void add_component_2 (float x1, float x2) {
        float tmp = x1 - x2;
        accu += tmp * tmp;
    }

    float result () {
        return accu;
    }

    /******* AVX accumulator *******/

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    void add_8_components_2 (__m256 x1, __m256 x2) {
        __m256 tmp = x1 - x2;
        accu8 += tmp * tmp;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th components
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
struct SimilarityIP {
    const float *y, *yi;

    explicit SimilarityIP (const float * y): y(y) {}

    /******* scalar accumulator *******/

    float accu;

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        accu += *yi++ * x;
    }

    void add_component_2 (float x1, float x2) {
        accu += x1 * x2;
    }

    float result () {
        return accu;
    }

    /******* AVX accumulator *******/

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    void add_8_components_2 (__m256 x1, __m256 x2) {
        accu8 += x1 * x2;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th components
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
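/* result_8 reduces the 8 accumulator lanes with two horizontal adds:
 * _mm256_hadd_ps does not cross the 128-bit boundary, so after the
 * first hadd each half holds pairwise sums, and after the second each
 * half holds its own 4-lane total in every lane. Adding lane 0 of the
 * low half to lane 0 of the high half yields the full sum. */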
/*******************************************************************
 * DistanceComputer: combines a Quantizer and a Similarity to do a
 * code-to-vector or code-to-code comparison
 */

template<class Quantizer, class Similarity>
struct DCTemplate : ScalarQuantizer::DistanceComputer {

    Quantizer quant;

    DCTemplate (size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance (const float *x,
                            const uint8_t *code) override
    {
        Similarity sim(x);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float xi = quant.reconstruct_component (code, i);
            sim.add_component (xi);
        }
        return sim.result();
    }

    float compute_code_distance (const uint8_t *code1,
                                 const uint8_t *code2) override
    {
        Similarity sim(nullptr);
        sim.begin ();
        for (size_t i = 0; i < quant.d; i++) {
            float x1 = quant.reconstruct_component (code1, i);
            float x2 = quant.reconstruct_component (code2, i);
            sim.add_component_2 (x1, x2);
        }
        return sim.result ();
    }

};
template<class Quantizer, class Similarity>
struct DCTemplate_8 : ScalarQuantizer::DistanceComputer {

    Quantizer quant;

    DCTemplate_8 (size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance (const float *x,
                            const uint8_t *code) override
    {
        Similarity sim(x);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 xi = quant.reconstruct_8_components (code, i);
            sim.add_8_components (xi);
        }
        return sim.result_8();
    }

    float compute_code_distance (const uint8_t *code1,
                                 const uint8_t *code2) override
    {
        Similarity sim(nullptr);
        sim.begin_8 ();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 x1 = quant.reconstruct_8_components (code1, i);
            __m256 x2 = quant.reconstruct_8_components (code2, i);
            sim.add_8_components_2 (x1, x2);
        }
        return sim.result_8 ();
    }

};
template<class Sim>
DistanceComputer *select_distance_computer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    if (d % 8 == 0) {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new DCTemplate_8<QuantizerNonUniform8
                                    <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new DCTemplate_8<QuantizerNonUniform8
                                    <Codec4bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new DCTemplate_8<QuantizerUniform8
                                    <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new DCTemplate_8<QuantizerUniform8
                                    <Codec4bit>, Sim>(d, trained);
        }
    } else {
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new DCTemplate<QuantizerNonUniform
                                  <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new DCTemplate<QuantizerNonUniform
                                  <Codec4bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new DCTemplate<QuantizerUniform
                                  <Codec8bit>, Sim>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new DCTemplate<QuantizerUniform
                                  <Codec4bit>, Sim>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}
/*******************************************************************
 * ScalarQuantizer implementation
 ********************************************************************/

ScalarQuantizer::ScalarQuantizer
          (size_t d, QuantizerType qtype):
              qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
{
    switch (qtype) {
    case QT_8bit: case QT_8bit_uniform:
        code_size = d;
        break;
    case QT_4bit: case QT_4bit_uniform:
        code_size = (d + 1) / 2;
        break;
    }
}

ScalarQuantizer::ScalarQuantizer ():
    qtype(QT_8bit),
    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
{}
void ScalarQuantizer::train (size_t n, const float *x)
{
    int bit_per_dim =
        qtype == QT_4bit_uniform ? 4 :
        qtype == QT_4bit ? 4 :
        qtype == QT_8bit_uniform ? 8 :
        qtype == QT_8bit ? 8 : -1;
    FAISS_THROW_IF_NOT (bit_per_dim > 0);

    switch (qtype) {
    case QT_4bit_uniform: case QT_8bit_uniform:
        train_Uniform (rangestat, rangestat_arg,
                       n * d, 1 << bit_per_dim, x, trained);
        break;
    case QT_4bit: case QT_8bit:
        train_NonUniform (rangestat, rangestat_arg,
                          n, d, 1 << bit_per_dim, x, trained);
        break;
    }
}
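/* Typical training/encoding sequence (a sketch with hypothetical
 * sizes nt, nb):
 *
 *   ScalarQuantizer sq (128, ScalarQuantizer::QT_8bit);
 *   sq.train (nt, training_vectors);        // fills sq.trained
 *   std::vector<uint8_t> codes (nb * sq.code_size);
 *   sq.compute_codes (database_vectors, codes.data(), nb);
 *
 * The uniform variants pool all n * d scalar components into one
 * training set; the non-uniform variants train one (vmin, vdiff)
 * pair per dimension. */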
void ScalarQuantizer::compute_codes (const float * x,
                                     uint8_t * codes,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
    // the 4-bit encoders OR into the codes, so they must start zeroed
    memset (codes, 0, code_size * n);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
    delete squant;
}
void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
    delete squant;
}
ScalarQuantizer::DistanceComputer *ScalarQuantizer::get_distance_computer (
        MetricType metric) const
{
    if (metric == METRIC_L2) {
        return select_distance_computer<SimilarityL2>(qtype, d, trained);
    } else {
        return select_distance_computer<SimilarityIP>(qtype, d, trained);
    }
}
/*******************************************************************
 * IndexScalarQuantizer implementation
 ********************************************************************/

IndexScalarQuantizer::IndexScalarQuantizer
          (int d, ScalarQuantizer::QuantizerType qtype, MetricType metric):
              Index(d, metric), sq (d, qtype)
{
    is_trained = false;
    code_size = sq.code_size;
}

IndexScalarQuantizer::IndexScalarQuantizer ():
    IndexScalarQuantizer (0, ScalarQuantizer::QT_8bit)
{}
template<class C>
void search_flat_scalar_quantizer(
        const IndexScalarQuantizer & index,
        idx_t n, const float* x, idx_t k,
        float* distances, idx_t* labels)
{
    size_t code_size = index.code_size;
    size_t d = index.d;

    DistanceComputer *dc =
        index.sq.get_distance_computer (index.metric_type);

#pragma omp parallel for
    for (size_t i = 0; i < n; i++) {
        idx_t *idxi = labels + i * k;
        float *simi = distances + i * k;
        heap_heapify<C> (k, simi, idxi);

        const float *xi = x + i * d;
        const uint8_t *ci = index.codes.data ();

        for (size_t j = 0; j < index.ntotal; j++) {
            float accu = dc->compute_distance(xi, ci);
            if (C::cmp (simi [0], accu)) {
                heap_pop<C> (k, simi, idxi);
                heap_push<C> (k, simi, idxi, accu, j);
            }
            ci += code_size;
        }
        heap_reorder<C> (k, simi, idxi);
    }
    delete dc;
}
void IndexScalarQuantizer::search (
        idx_t n, const float* x, idx_t k,
        float* distances, idx_t* labels) const
{
    if (metric_type == METRIC_L2) {
        search_flat_scalar_quantizer<CMax<float, idx_t> > (
            *this, n, x, k, distances, labels);
    } else {
        search_flat_scalar_quantizer<CMin<float, idx_t> > (
            *this, n, x, k, distances, labels);
    }
}
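/* The CMax/CMin comparators select the heap direction: L2 search
 * keeps the k smallest distances (max-heap on the current worst
 * result), inner-product search keeps the k largest scores
 * (min-heap). A minimal usage sketch, with hypothetical sizes:
 *
 *   IndexScalarQuantizer index (128, ScalarQuantizer::QT_8bit);
 *   index.train (nt, xt);
 *   index.add (nb, xb);
 *   index.search (nq, xq, 10, distances, labels);
 */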
void IndexScalarQuantizer::reconstruct_n (
        idx_t i0, idx_t ni, float* recons) const
{
    Quantizer *squant = select_quantizer (sq);
    for (size_t i = 0; i < ni; i++) {
        squant->decode_vector (&codes[(i + i0) * code_size],
                               recons + i * d);
    }
    delete squant;
}
/*******************************************************************
 * IndexIVFScalarQuantizer implementation
 ********************************************************************/

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
          (Index *quantizer, size_t d, size_t nlist,
           ScalarQuantizer::QuantizerType qtype, MetricType metric):
              IndexIVF (quantizer, d, nlist, 0, metric),
              sq (d, qtype)
{
    code_size = sq.code_size;
    // was not known at construction time
    invlists->code_size = code_size;
    is_trained = false;
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
    IndexIVF ()
{}
void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
    std::vector<long> idx (n);
    quantizer->assign (n, x, idx.data());

    std::vector<float> residuals (n * d);

#pragma omp parallel for
    for (idx_t i = 0; i < n; i++) {
        quantizer->compute_residual (
            x + i * d, residuals.data() + i * d, idx[i]);
    }

    sq.train (n, residuals.data());
}
void IndexIVFScalarQuantizer::add_with_ids
       (idx_t n, const float * x, const long *xids)
{
    FAISS_THROW_IF_NOT (is_trained);
    std::vector<long> idx (n);
    quantizer->assign (n, x, idx.data());

    Quantizer *squant = select_quantizer (sq);
    long nadd = 0;

#pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
        std::vector<uint8_t> one_code (code_size);
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        // each thread takes care of a subset of lists
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                long id = xids ? xids[i] : ntotal + i;

                quantizer->compute_residual (
                      x + i * d, residual.data(), list_no);

                // 4-bit encoders OR into the code, so clear it first
                memset (one_code.data(), 0, code_size);
                squant->encode_vector (residual.data(), one_code.data());

                invlists->add_entry (list_no, id, one_code.data());

                nadd++;
            }
        }
    }
    ntotal += nadd;
    delete squant;
}
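/* Threading note: rather than locking the inverted lists, each OpenMP
 * thread owns the lists with list_no % nt == rank and skips the
 * others, so every list is written by exactly one thread; the
 * reduction(+: nadd) then recombines the per-thread insertion
 * counts. */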
static void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
                            const idx_t *cent_ids, const float *cent_dis,
                            DistanceComputer & dc,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    minheap_heapify (k, simi, idxi);

    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;
        float accu0 = cent_dis[i];

        const size_t list_size = index.invlists->list_size (list_no);
        const uint8_t *codes = index.invlists->get_codes (list_no);
        const idx_t *ids =
            store_pairs ? nullptr : index.invlists->get_ids (list_no);

        for (size_t j = 0; j < list_size; j++) {

            float accu = accu0 + dc.compute_distance(x, codes);

            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                minheap_push (k, simi, idxi, accu, id);
            }
            codes += code_size;
        }
    }
    minheap_reorder (k, simi, idxi);
}
static void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
                            const float *x_in,
                            const idx_t *cent_ids,
                            const Index *quantizer,
                            DistanceComputer & dc,
                            int k, float *simi, idx_t *idxi,
                            bool store_pairs)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> x(d);
    maxheap_heapify (k, simi, idxi);
    size_t nscan = 0;

    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;

        const size_t list_size = index.invlists->list_size (list_no);
        const uint8_t *codes = index.invlists->get_codes (list_no);
        const idx_t *ids =
            store_pairs ? nullptr : index.invlists->get_ids (list_no);

        // shift of x_in wrt the list's centroid
        quantizer->compute_residual (x_in, x.data(), list_no);

        for (size_t j = 0; j < list_size; j++) {

            float dis = dc.compute_distance (x.data(), codes);

            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                maxheap_push (k, simi, idxi, dis, id);
            }
            codes += code_size;
        }
        nscan += list_size;
        if (index.max_codes && nscan > index.max_codes)
            break;
    }
    maxheap_reorder (k, simi, idxi);
}
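/* When store_pairs is set, the reported label packs the inverted list
 * number into the high 32 bits and the offset within the list into
 * the low 32 bits (list_no << 32 | j), so the caller can locate the
 * code later without an id lookup; otherwise the stored ids are
 * returned. */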
void IndexIVFScalarQuantizer::search_preassigned (
        idx_t n, const float *x, idx_t k,
        const idx_t *idx, const float *dis,
        float *distances, idx_t *labels,
        bool store_pairs) const
{
    FAISS_THROW_IF_NOT (is_trained);
    if (metric_type == METRIC_INNER_PRODUCT) {
#pragma omp parallel
        {
            DistanceComputer *dc = sq.get_distance_computer (metric_type);
#pragma omp for
            for (size_t i = 0; i < n; i++) {
                search_with_probes_ip (*this, x + i * d,
                                       idx + i * nprobe, dis + i * nprobe, *dc,
                                       k, distances + i * k, labels + i * k,
                                       store_pairs);
            }
            delete dc;
        }
    } else {
#pragma omp parallel
        {
            DistanceComputer *dc = sq.get_distance_computer (metric_type);
#pragma omp for
            for (size_t i = 0; i < n; i++) {
                search_with_probes_L2 (*this, x + i * d,
                                       idx + i * nprobe, quantizer, *dc,
                                       k, distances + i * k, labels + i * k,
                                       store_pairs);
            }
            delete dc;
        }
    }
}
void IndexIVFScalarQuantizer::reconstruct_from_offset (long list_no,
                                                       long offset,
                                                       float* recons) const
{
    std::vector<float> centroid(d);
    quantizer->reconstruct (list_no, centroid.data());

    const uint8_t* code = invlists->get_single_code (list_no, offset);
    sq.decode (code, recons, 1);
    for (int i = 0; i < d; ++i) {
        recons[i] += centroid[i];
    }
}

} // namespace faiss
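/* End-to-end usage sketch for the IVF variant (hypothetical sizes;
 * IndexFlatL2 serves as the coarse quantizer):
 *
 *   faiss::IndexFlatL2 coarse (128);
 *   faiss::IndexIVFScalarQuantizer index (
 *       &coarse, 128, 1024,
 *       faiss::ScalarQuantizer::QT_8bit, faiss::METRIC_L2);
 *   index.train (nt, xt);   // trains coarse centroids + SQ ranges
 *   index.add (nb, xb);
 *   index.nprobe = 8;       // number of inverted lists to visit
 *   index.search (nq, xq, 10, distances, labels);
 */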