#include "IndexIVFScalarQuantizer.h"

#include <cstdio>
#include <cstring>
#include <cmath>
#include <cassert>
#include <algorithm>

#include <omp.h>

#include <immintrin.h>

#include "utils.h"
#include "FaissAssert.h"

namespace faiss {

typedef Index::idx_t idx_t;
typedef ScalarQuantizer::RangeStat RangeStat;
typedef ScalarQuantizer::QuantizerType QuantizerType;
/*******************************************************************
 * Codec: translates between a float in [0, 1] and one quantized code
 * component. Codec8bit stores one byte per component.
 *******************************************************************/

struct Codec8bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code[i] = (int)(255 * x);
    }

    static float decode_component (const uint8_t *code, int i) {
        // + 0.5: reconstruct at the center of the quantization bin
        return (code[i] + 0.5f) / 255.0f;
    }

    // decode 8 consecutive components into a vector of 8 floats (AVX)
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint64_t c8 = *(uint64_t*)(code + i);
        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32((int)c8));
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32((int)(c8 >> 32)));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
        return f8 * one_255;
    }

};
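// decode_8_components widens 8 consecutive byte codes into 8 floats in one
// shot: the two 4-byte halves are zero-extended to 32-bit integers with
// _mm_cvtepu8_epi32, merged into a single __m256i, converted to float, then
// shifted by 0.5 and scaled by 1/255 -- exactly what 8 calls to
// decode_component would produce. A scalar sketch of the same loop body
// (for illustration only, not part of the original file):
//
//     for (int j = 0; j < 8; j++)
//         out[j] = (code[i + j] + 0.5f) / 255.0f;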
struct Codec4bit {

    // component i occupies nibble (i & 1) of byte i / 2
    static void encode_component (float x, uint8_t *code, int i) {
        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }

    // decode 8 consecutive components (4 code bytes) into 8 floats (AVX)
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;          // even components (low nibbles)
        uint32_t c4od = (c4 >> 4) & mask;   // odd components (high nibbles)

        // interleave the even and odd nibbles back into 8 bytes
        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
                                        _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_15 = _mm256_set1_ps (1.f / 15.f);
        return f8 * one_15;
    }

};
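// The 4-bit decoder yields the same values as 8 scalar decode_component
// calls: the nibble unpack rebuilds one byte per component, after which the
// byte-to-float path is identical to Codec8bit, only with a 1/15 scale.
// Note that encode_component ORs nibbles into the destination byte, so code
// buffers must be zero-initialized before encoding 4-bit vectors.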
/*******************************************************************
 * Similarity objects: accumulate decoded components into a distance or
 * score against the query vector y stored in the object.
 *******************************************************************/

struct SimilarityL2 {
    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/

    float accu;

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        float tmp = *yi++ - x;
        accu += tmp * tmp;
    }

    float result () {
        return accu;
    }

    /******* AVX accumulator, 8 components at a time *******/

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    float result_8 () {
        // horizontal sum of the 8 partial sums
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }

};
struct SimilarityIP {
    const float *y, *yi;
    // score of the query against the coarse centroid, added to the result
    const float accu0;

    /******* scalar accumulator *******/

    float accu;

    SimilarityIP (const float * y, float accu0):
        y (y), accu0 (accu0) {}

    void begin () {
        accu = accu0;
        yi = y;
    }

    void add_component (float x) {
        accu += *yi++ * x;
    }

    float result () {
        return accu;
    }

    /******* AVX accumulator, 8 components at a time *******/

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        return
            accu0 +
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }

};
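// Both similarity classes expose the same accumulator protocol
// (begin / add_component / result plus an 8-wide AVX variant), which lets
// the distance loops below be templatized over the metric. A minimal sketch
// of the intended call sequence (hypothetical caller, for illustration):
//
//     SimilarityL2 sim (query);          // query: d floats
//     sim.begin ();
//     for (size_t i = 0; i < d; i++)
//         sim.add_component (reconstructed[i]);
//     float l2_distance = sim.result ();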
/*******************************************************************
 * Templatized distance functions: decode the code on the fly and feed the
 * components into a Similarity accumulator.
 *******************************************************************/

template<class Quantizer, class Similarity>
float compute_distance (const Quantizer & quant, Similarity & sim,
                        const uint8_t *code)
{
    sim.begin ();
    for (size_t i = 0; i < quant.d; i++) {
        float xi = quant.reconstruct_component (code, i);
        sim.add_component (xi);
    }
    return sim.result ();
}

// same, 8 components at a time with the AVX decoders (assumes d % 8 == 0)
template<class Quantizer, class Similarity>
float compute_distance_8 (const Quantizer & quant, Similarity & sim,
                          const uint8_t *code)
{
    sim.begin_8 ();
    for (size_t i = 0; i < quant.d; i += 8) {
        __m256 xi = quant.reconstruct_8_components (code, i);
        sim.add_8_components (xi);
    }
    return sim.result_8();
}
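// compute_distance / compute_distance_8 are instantiated once per
// (Quantizer, Similarity) combination, so the per-component decode and
// accumulate steps inline away instead of going through a virtual call per
// component; only the per-vector entry points of the Quantizer interface
// below are virtual.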
/*******************************************************************
 * Quantizer range training
 *******************************************************************/

static float sqr (float x) {
    return x * x;
}

void train_Uniform(RangeStat rs, float rs_arg,
                   idx_t n, int k, const float *x,
                   std::vector<float> & trained)
{
    trained.resize (2);
    float & vmin = trained[0];
    float & vmax = trained[1];

    if (rs == ScalarQuantizer::RS_minmax) {
        vmin = HUGE_VAL; vmax = -HUGE_VAL;
        for (size_t i = 0; i < n; i++) {
            if (x[i] < vmin) vmin = x[i];
            if (x[i] > vmax) vmax = x[i];
        }
        // expand the range by rs_arg * (max - min) on both sides
        float vexp = (vmax - vmin) * rs_arg;
        vmin -= vexp;
        vmax += vexp;
    } else if (rs == ScalarQuantizer::RS_meanstd) {
        double sum = 0, sum2 = 0;
        for (size_t i = 0; i < n; i++) {
            sum += x[i];
            sum2 += x[i] * x[i];
        }
        float mean = sum / n;
        float var = sum2 / n - mean * mean;
        float std = var <= 0 ? 1.0 : sqrt(var);

        vmin = mean - std * rs_arg;
        vmax = mean + std * rs_arg;
    } else if (rs == ScalarQuantizer::RS_quantiles) {
        std::vector<float> x_copy(n);
        memcpy(x_copy.data(), x, n * sizeof(*x));
        std::sort(x_copy.begin(), x_copy.end());
        int o = int(rs_arg * n);
        if (o < 0) o = 0;
        if (o > n - o) o = n / 2;
        vmin = x_copy[o];
        vmax = x_copy[n - 1 - o];
    } else if (rs == ScalarQuantizer::RS_optim) {
        float a, b;
        float sx = 0;
        {
            vmin = HUGE_VAL, vmax = -HUGE_VAL;
            for (size_t i = 0; i < n; i++) {
                if (x[i] < vmin) vmin = x[i];
                if (x[i] > vmax) vmax = x[i];
                sx += x[i];
            }
            b = vmin;
            a = (vmax - vmin) / (k - 1);
        }
        int verbose = false;
        int niter = 2000;
        float last_err = -1;
        int iter_last_err = 0;
        for (int it = 0; it < niter; it++) {
            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;

            for (idx_t i = 0; i < n; i++) {
                float xi = x[i];
                float ni = floor ((xi - b) / a + 0.5);
                if (ni < 0) ni = 0;
                if (ni >= k) ni = k - 1;
                err1 += sqr (xi - (ni * a + b));
                sn  += ni;
                sn2 += ni * ni;
                sxn += ni * xi;
            }

            if (err1 == last_err) {
                iter_last_err ++;
                if (iter_last_err == 16) break;
            } else {
                last_err = err1;
                iter_last_err = 0;
            }

            // least-squares refit of the affine map given the assignments
            float det = sqr (sn) - sn2 * n;

            b = (sn * sxn - sn2 * sx) / det;
            a = (sn * sx - n * sxn) / det;
            if (verbose) {
                printf ("it %d, err1=%g    \r", it, err1);
                fflush (stdout);
            }
        }
        if (verbose) printf("\n");

        vmin = b;
        vmax = b + a * (k - 1);
    } else {
        FAISS_THROW_MSG ("Invalid qtype");
    }
    vmax -= vmin;   // trained[] stores (vmin, vdiff), not (vmin, vmax)
}
void train_NonUniform(RangeStat rs, float rs_arg,
                      idx_t n, int d, int k, const float *x,
                      std::vector<float> & trained)
{
    trained.resize (2 * d);
    float * vmin = trained.data();
    float * vmax = trained.data() + d;
    if (rs == ScalarQuantizer::RS_minmax) {
        memcpy (vmin, x, sizeof(*x) * d);
        memcpy (vmax, x, sizeof(*x) * d);
        for (size_t i = 1; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                if (xi[j] < vmin[j]) vmin[j] = xi[j];
                if (xi[j] > vmax[j]) vmax[j] = xi[j];
            }
        }
        float *vdiff = vmax;
        for (size_t j = 0; j < d; j++) {
            float vexp = (vmax[j] - vmin[j]) * rs_arg;
            vmin[j] -= vexp;
            vmax[j] += vexp;
            vdiff [j] = vmax[j] - vmin[j];
        }
    } else {
        // transpose, then train each dimension separately
        std::vector<float> xt(n * d);
        for (size_t i = 0; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                xt[j * n + i] = xi[j];
            }
        }
#pragma omp parallel for
        for (size_t j = 0; j < d; j++) {
            std::vector<float> trained_d(2);
            train_Uniform(rs, rs_arg,
                          n, k, xt.data() + j * n,
                          trained_d);
            vmin[j] = trained_d[0];
            vmax[j] = trained_d[1];
        }
    }
}
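// RS_optim in train_Uniform alternates two steps: with the affine map (a, b)
// fixed, each sample is assigned its nearest level ni = round((x - b) / a)
// clamped to [0, k-1]; with the assignments fixed, (a, b) is re-estimated as
// the least-squares fit of x ~ a * ni + b. The other range statistics are
// one-shot: min/max with an optional margin, mean +/- rs_arg standard
// deviations, or symmetric quantiles of the sorted values.
//
// train_NonUniform lays out the trained vector as d vmin values followed by
// d vdiff values, one pair per dimension, whereas train_Uniform stores a
// single (vmin, vdiff) pair shared by all dimensions. For the non-minmax
// statistics the data is transposed first so that each dimension is
// contiguous and can be handed to train_Uniform independently.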
/*******************************************************************
 * Quantizer: virtual interface implemented below for each combination of
 * codec (8/4 bit) and range type (uniform / non-uniform).
 *******************************************************************/

struct Quantizer {

    virtual void encode_vector (const float *x, uint8_t *code) const = 0;
    virtual void decode_vector (const uint8_t *code, float *x) const = 0;

    virtual float compute_distance_L2 (SimilarityL2 &sim,
                                       const uint8_t * codes) const = 0;
    virtual float compute_distance_IP (SimilarityIP &sim,
                                       const uint8_t * codes) const = 0;

    virtual ~Quantizer() {}
};
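// A Quantizer produced by select_quantizer() below is used one vector at a
// time, e.g. (illustrative sketch; sq is an already-trained ScalarQuantizer,
// x and x_rec point to sq.d floats):
//
//     Quantizer *q = select_quantizer (sq);
//     std::vector<uint8_t> code (sq.code_size);   // zero-initialized
//     q->encode_vector (x, code.data());
//     q->decode_vector (code.data(), x_rec);      // approximate reconstruction
//     delete q;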
template<class Codec>
struct QuantizerUniform: Quantizer {
    const size_t d;
    const float vmin, vdiff;

    QuantizerUniform(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained[0]), vdiff(trained[1]) {
    }

    void encode_vector(const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin) / vdiff;
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin + xi * vdiff;
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin + xi * vdiff;
    }

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_set1_ps(vmin) + xi * _mm256_set1_ps (vdiff);
    }

    float compute_distance_L2(SimilarityL2& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance(*this, sim, codes);
    }
};
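// QuantizerUniform reconstructs component i as vmin + decode(code, i) * vdiff
// with a single (vmin, vdiff) pair for every dimension -- exactly the two
// values produced by train_Uniform. Encoding applies the inverse affine map,
// clipped to [0, 1] before the codec quantizes it.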
// same as QuantizerUniform, but distances decode 8 components at a time
template<class Codec>
struct QuantizerUniform8: QuantizerUniform<Codec> {

    QuantizerUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerUniform<Codec> (d, trained) {}

    float compute_distance_L2(SimilarityL2& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance_8(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance_8(*this, sim, codes);
    }
};
template<class Codec>
struct QuantizerNonUniform: Quantizer {
    const size_t d;
    const float *vmin, *vdiff;

    QuantizerNonUniform(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}

    void encode_vector(const float* x, uint8_t* code) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin[i]) / vdiff[i];
            if (xi < 0) xi = 0;
            if (xi > 1.0) xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const override {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin[i] + xi * vdiff[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin[i] + xi * vdiff[i];
    }

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_loadu_ps(vmin + i) + xi * _mm256_loadu_ps (vdiff + i);
    }

    float compute_distance_L2(SimilarityL2& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance(*this, sim, codes);
    }
};
// same as QuantizerNonUniform, but distances decode 8 components at a time
template<class Codec>
struct QuantizerNonUniform8: QuantizerNonUniform<Codec> {

    QuantizerNonUniform8 (size_t d, const std::vector<float> &trained):
        QuantizerNonUniform<Codec> (d, trained) {}

    float compute_distance_L2(SimilarityL2& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance_8(*this, sim, codes);
    }

    float compute_distance_IP(SimilarityIP& sim,
                              const uint8_t* codes) const override
    {
        return compute_distance_8(*this, sim, codes);
    }
};
Quantizer *select_quantizer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    if (d % 8 == 0) {
        // variants that decode 8 components at a time with AVX
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform8<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform8<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform8<Codec4bit>(d, trained);
        }
    } else {
        // scalar variants
        switch(qtype) {
        case ScalarQuantizer::QT_8bit:
            return new QuantizerNonUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit:
            return new QuantizerNonUniform<Codec4bit>(d, trained);
        case ScalarQuantizer::QT_8bit_uniform:
            return new QuantizerUniform<Codec8bit>(d, trained);
        case ScalarQuantizer::QT_4bit_uniform:
            return new QuantizerUniform<Codec4bit>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}
Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
    return select_quantizer (sq.qtype, sq.d, sq.trained);
}
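// select_quantizer is the only place that knows about the concrete Quantizer
// classes: it picks the codec (8 or 4 bit), uniform vs. per-dimension ranges,
// and the 8-components-at-a-time variants when the dimension is a multiple
// of 8. Callers only ever see the abstract Quantizer interface.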
/*******************************************************************
 * ScalarQuantizer implementation
 *******************************************************************/

ScalarQuantizer::ScalarQuantizer
          (size_t d, QuantizerType qtype):
              qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
{
    switch (qtype) {
    case QT_8bit: case QT_8bit_uniform:
        code_size = d;            // one byte per component
        break;
    case QT_4bit: case QT_4bit_uniform:
        code_size = (d + 1) / 2;  // two components per byte
        break;
    }
}

ScalarQuantizer::ScalarQuantizer ():
    qtype(QT_8bit),
    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
{}
void ScalarQuantizer::train (size_t n, const float *x)
{
    int bit_per_dim =
        qtype == QT_4bit_uniform ? 4 :
        qtype == QT_4bit ? 4 :
        qtype == QT_8bit_uniform ? 8 :
        qtype == QT_8bit ? 8 : -1;
    FAISS_THROW_IF_NOT (bit_per_dim > 0);

    switch (qtype) {
    case QT_4bit_uniform: case QT_8bit_uniform:
        train_Uniform (rangestat, rangestat_arg,
                       n * d, 1 << bit_per_dim, x, trained);
        break;
    case QT_4bit: case QT_8bit:
        train_NonUniform (rangestat, rangestat_arg,
                          n, d, 1 << bit_per_dim, x, trained);
        break;
    }
}
void ScalarQuantizer::compute_codes (const float * x,
                                     uint8_t * codes,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
    // the 4-bit codecs OR nibbles into the output bytes, so clear them first
    memset (codes, 0, code_size * n);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
    delete squant;
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
    delete squant;
}
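// Typical standalone use of ScalarQuantizer (illustrative sketch; the
// training set xt of nt vectors is assumed representative of the data):
//
//     ScalarQuantizer sq (d, ScalarQuantizer::QT_8bit);
//     sq.train (nt, xt);
//     std::vector<uint8_t> codes (n * sq.code_size);
//     sq.compute_codes (xb, codes.data(), n);     // encode n vectors
//     std::vector<float> xrec (n * d);
//     sq.decode (codes.data(), xrec.data(), n);   // approximate reconstruction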
/*******************************************************************
 * IndexIVFScalarQuantizer implementation
 *******************************************************************/

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
          (Index *quantizer, size_t d, size_t nlist,
           QuantizerType qtype, MetricType metric):
              IndexIVF (quantizer, d, nlist, metric),
              sq (d, qtype)
{
    code_size = sq.code_size;
    codes.resize (nlist);
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
      IndexIVF (), code_size (0)
{}
void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
    long * idx = new long [n];
    quantizer->assign (n, x, idx);
    float *residuals = new float [n * d];

#pragma omp parallel for
    for (idx_t i = 0; i < n; i++) {
        quantizer->compute_residual (x + i * d, residuals + i * d, idx[i]);
    }

    // train the scalar quantizer on residuals wrt the coarse centroids
    sq.train (n, residuals);

    delete [] residuals;
    delete [] idx;
}
void IndexIVFScalarQuantizer::add_with_ids
       (idx_t n, const float * x, const long *xids)
{
    FAISS_THROW_IF_NOT (is_trained);
    long * idx = new long [n];
    quantizer->assign (n, x, idx);
    size_t nadd = 0;
    Quantizer *squant = select_quantizer (sq);

#pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        // each thread takes care of a subset of the inverted lists
        for (size_t i = 0; i < n; i++) {

            long list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                long id = xids ? xids[i] : ntotal + i;

                assert (list_no < nlist);

                ids[list_no].push_back (id);
                nadd++;
                quantizer->compute_residual (
                      x + i * d, residual.data(), list_no);

                size_t cur_size = codes[list_no].size();
                codes[list_no].resize (cur_size + code_size);

                squant->encode_vector (residual.data(),
                                       codes[list_no].data() + cur_size);
            }
        }
    }

    ntotal += nadd;
    delete squant;
    delete [] idx;
}
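// Parallelization in add_with_ids is per inverted list rather than per
// vector: every thread scans all n coarse assignments but only appends to
// the lists whose number satisfies list_no % nt == rank, so no two threads
// ever touch the same list and no locking is needed. nadd counts the vectors
// actually stored (entries with list_no < 0 are skipped).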
/*******************************************************************
 * Search
 *******************************************************************/

void search_with_probes_ip (const IndexIVFScalarQuantizer & index,
                            const float *x,
                            const idx_t *cent_ids, const float *cent_dis,
                            const Quantizer & quant,
                            int k, float *simi, idx_t *idxi)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> decoded(d);
    minheap_heapify (k, simi, idxi);
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;
        float accu0 = cent_dis[i];

        const std::vector<idx_t> & ids = index.ids[list_no];
        const uint8_t* codes = index.codes[list_no].data();

        SimilarityIP sim(x, accu0);

        for (size_t j = 0; j < ids.size(); j++) {

            float accu = quant.compute_distance_IP(sim, codes);

            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
                minheap_push (k, simi, idxi, accu, ids[j]);
            }
            codes += code_size;
        }
    }
    minheap_reorder (k, simi, idxi);
}
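// For inner-product search the k best results are the largest scores, so a
// min-heap is used: simi[0] always holds the worst score currently kept and
// a candidate only enters the heap if it beats it. cent_dis[i], the score of
// the query against the probed centroid, is folded into every result by
// SimilarityIP because the stored codes are residuals wrt that centroid.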
void search_with_probes_L2 (const IndexIVFScalarQuantizer & index,
                            const float *x_in,
                            const idx_t *cent_ids,
                            const Index *quantizer,
                            const Quantizer & quant,
                            int k, float *simi, idx_t *idxi)
{
    int nprobe = index.nprobe;
    size_t code_size = index.code_size;
    size_t d = index.d;
    std::vector<float> decoded(d), x(d);
    maxheap_heapify (k, simi, idxi);
    for (int i = 0; i < nprobe; i++) {
        idx_t list_no = cent_ids[i];
        if (list_no < 0) break;

        const std::vector<idx_t> & ids = index.ids[list_no];
        const uint8_t* codes = index.codes[list_no].data();

        // shift the query: residual of x_in wrt the probed centroid
        quantizer->compute_residual (x_in, x.data(), list_no);

        SimilarityL2 sim(x.data());

        for (size_t j = 0; j < ids.size(); j++) {

            float dis = quant.compute_distance_L2 (sim, codes);

            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
                maxheap_push (k, simi, idxi, dis, ids[j]);
            }
            codes += code_size;
        }
    }
    maxheap_reorder (k, simi, idxi);
}
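// The L2 path mirrors the IP path with the comparison reversed: the k best
// results are the smallest distances, so a max-heap keeps the worst kept
// distance at simi[0]. Instead of adding a centroid term to the score, the
// query itself is re-expressed as a residual wrt each probed centroid before
// the per-code distance loop runs.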
void IndexIVFScalarQuantizer::search (idx_t n, const float *x, idx_t k,
                                      float *distances, idx_t *labels) const
{
    idx_t *idx = new idx_t [n * nprobe];
    float *dis = new float [n * nprobe];

    // coarse quantization: nprobe nearest centroids per query
    quantizer->search (n, x, nprobe, dis, idx);

    Quantizer *squant = select_quantizer (sq);

    if (metric_type == METRIC_INNER_PRODUCT) {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            search_with_probes_ip (*this, x + i * d,
                                   idx + i * nprobe, dis + i * nprobe, *squant,
                                   k, distances + i * k, labels + i * k);
        }
    } else {
#pragma omp parallel for
        for (size_t i = 0; i < n; i++) {
            search_with_probes_L2 (*this, x + i * d,
                                   idx + i * nprobe, quantizer, *squant,
                                   k, distances + i * k, labels + i * k);
        }
    }

    delete squant;
    delete [] dis;
    delete [] idx;
}
void IndexIVFScalarQuantizer::merge_from_residuals (IndexIVF &other_in)
{
    IndexIVFScalarQuantizer &other =
        dynamic_cast<IndexIVFScalarQuantizer &> (other_in);
    for (int i = 0; i < nlist; i++) {
        std::vector<uint8_t> & src = other.codes[i];
        std::vector<uint8_t> & dest = codes[i];
        dest.insert (dest.end(), src.begin (), src.end ());
    }
}

} // namespace faiss
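// End-to-end use of the index (illustrative sketch; IndexFlatL2, train and
// add come from the rest of the Faiss library, not from this file):
//
//     IndexFlatL2 coarse (d);
//     IndexIVFScalarQuantizer index (&coarse, d, nlist,
//                                    ScalarQuantizer::QT_8bit, METRIC_L2);
//     index.train (nt, xt);              // trains the coarse quantizer + sq
//     index.add (nb, xb);
//     index.nprobe = 8;
//     index.search (nq, xq, k, distances, labels);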