Faiss
/tmp/faiss/IndexScalarQuantizer.cpp
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // -*- c++ -*-
10 
11 #include "IndexScalarQuantizer.h"
12 
13 #include <cstdio>
14 #include <algorithm>
15 
16 #include <omp.h>
17 
18 #ifdef __SSE__
19 #include <immintrin.h>
20 #endif
21 
22 #include "utils.h"
23 
24 #include "FaissAssert.h"
25 
26 namespace faiss {
27 
28 /*******************************************************************
29  * ScalarQuantizer implementation
30  *
31  * The main source of complexity is to support combinations of 4
32  * variants without incurring runtime tests or virtual function calls:
33  *
34  * - 4 / 8 bits per code component
35  * - uniform / non-uniform
36  * - IP / L2 distance search
37  * - scalar / AVX distance computation
38  *
39  * The appropriate Quantizer object is returned via select_quantizer
40  * that hides the template mess.
41  ********************************************************************/
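/* A sketch of how those variants map onto the template machinery below
   (illustrative; the exact instantiations are defined further down):

     QuantizerTemplate<Codec8bit, true,  1>   // 8 bit, uniform range, scalar loop
     QuantizerTemplate<Codec8bit, false, 8>   // 8 bit, per-dimension range, AVX
     QuantizerTemplate<Codec4bit, true,  8>   // 4 bit, uniform range, AVX
     QuantizerFP16<8>                         // fp16 codes, AVX

   select_quantizer / select_distance_computer pick one such instantiation up
   front, so the per-component inner loops stay free of branches and virtual
   calls. */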
42 
43 #ifdef __AVX__
44 #define USE_AVX
45 #endif
46 
47 
48 namespace {
49 
50 typedef Index::idx_t idx_t;
51 typedef ScalarQuantizer::QuantizerType QuantizerType;
52 typedef ScalarQuantizer::RangeStat RangeStat;
53 using DistanceComputer = ScalarQuantizer::DistanceComputer;
54 
55 
56 /*******************************************************************
57  * Codec: converts between values in [0, 1] and an index in a code
58  * array. The "i" parameter is the vector component index (not byte
59  * index).
60  */
61 
62 struct Codec8bit {
63 
64  static void encode_component (float x, uint8_t *code, int i) {
65  code[i] = (int)(255 * x);
66  }
67 
68  static float decode_component (const uint8_t *code, int i) {
69  return (code[i] + 0.5f) / 255.0f;
70  }
71 
72 #ifdef USE_AVX
73  static __m256 decode_8_components (const uint8_t *code, int i) {
74  uint64_t c8 = *(uint64_t*)(code + i);
75  __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
76  __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
77  // __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
78  __m256i i8 = _mm256_castsi128_si256 (c4lo);
79  i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
80  __m256 f8 = _mm256_cvtepi32_ps (i8);
81  __m256 half = _mm256_set1_ps (0.5f);
82  f8 += half;
83  __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
84  return f8 * one_255;
85  }
86 #endif
87 };
88 
89 
90 struct Codec4bit {
91 
92  static void encode_component (float x, uint8_t *code, int i) {
93  code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
94  }
95 
96  static float decode_component (const uint8_t *code, int i) {
97  return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
98  }
99 
100 
101 #ifdef USE_AVX
102  static __m256 decode_8_components (const uint8_t *code, int i) {
103  uint32_t c4 = *(uint32_t*)(code + (i >> 1));
104  uint32_t mask = 0x0f0f0f0f;
105  uint32_t c4ev = c4 & mask;
106  uint32_t c4od = (c4 >> 4) & mask;
107 
108  // the 8 lower bytes of c8 contain the values
109  __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
110  _mm_set1_epi32(c4od));
111  __m128i c4lo = _mm_cvtepu8_epi32 (c8);
112  __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
113  __m256i i8 = _mm256_castsi128_si256 (c4lo);
114  i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
115  __m256 f8 = _mm256_cvtepi32_ps (i8);
116  __m256 half = _mm256_set1_ps (0.5f);
117  f8 += half;
118  __m256 one_255 = _mm256_set1_ps (1.f / 15.f);
119  return f8 * one_255;
120  }
121 #endif
122 };
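/* A small worked example of the two codecs (illustrative values):

     uint8_t c8[1] = {0};
     Codec8bit::encode_component (0.6f, c8, 0);       // c8[0] = (int)(255 * 0.6) = 153
     float r8 = Codec8bit::decode_component (c8, 0);  // (153 + 0.5) / 255 ≈ 0.6020

     uint8_t c4[1] = {0};
     Codec4bit::encode_component (0.6f, c4, 0);       // low nibble = (int)(0.6 * 15) = 9
     float r4 = Codec4bit::decode_component (c4, 0);  // (9 + 0.5) / 15 ≈ 0.6333

   The +0.5 in decoding reconstructs the centre of the quantization bin. */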
123 
124 
125 #ifdef USE_AVX
126 
127 
128 uint16_t encode_fp16 (float x) {
129  __m128 xf = _mm_set1_ps (x);
130  __m128i xi = _mm_cvtps_ph (
131  xf, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
132  return _mm_cvtsi128_si32 (xi) & 0xffff;
133 }
134 
135 
136 float decode_fp16 (uint16_t x) {
137  __m128i xi = _mm_set1_epi16 (x);
138  __m128 xf = _mm_cvtph_ps (xi);
139  return _mm_cvtss_f32 (xf);
140 }
141 
142 #else
143 
144 // non-intrinsic FP16 <-> FP32 code adapted from
145 // https://github.com/ispc/ispc/blob/master/stdlib.ispc
146 
147 float floatbits (uint32_t x) {
148  void *xptr = &x;
149  return *(float*)xptr;
150 }
151 
152 uint32_t intbits (float f) {
153  void *fptr = &f;
154  return *(uint32_t*)fptr;
155 }
156 
157 
158 uint16_t encode_fp16 (float f) {
159 
160  // via Fabian "ryg" Giesen.
161  // https://gist.github.com/2156668
162  uint32_t sign_mask = 0x80000000u;
163  int32_t o;
164 
165  uint32_t fint = intbits(f);
166  uint32_t sign = fint & sign_mask;
167  fint ^= sign;
168 
169  // NOTE all the integer compares in this function can be safely
170  // compiled into signed compares since all operands are below
171  // 0x80000000. Important if you want fast straight SSE2 code (since
172  // there's no unsigned PCMPGTD).
173 
174  // Inf or NaN (all exponent bits set)
175  // NaN->qNaN and Inf->Inf
176  // unconditional assignment here, will override with right value for
177  // the regular case below.
178  uint32_t f32infty = 255u << 23;
179  o = (fint > f32infty) ? 0x7e00u : 0x7c00u;
180 
181  // (De)normalized number or zero
182  // update fint unconditionally to save the blending; we don't need it
183  // anymore for the Inf/NaN case anyway.
184 
185  const uint32_t round_mask = ~0xfffu;
186  const uint32_t magic = 15u << 23;
187 
188  // Shift exponent down, denormalize if necessary.
189  // NOTE This represents half-float denormals using single
190  // precision denormals. The main reason to do this is that
191  // there's no shift with per-lane variable shifts in SSE*, which
192  // we'd otherwise need. It has some funky side effects though:
193  // - This conversion will actually respect the FTZ (Flush To Zero)
194  // flag in MXCSR - if it's set, no half-float denormals will be
195  // generated. I'm honestly not sure whether this is good or
196  // bad. It's definitely interesting.
197  // - If the underlying HW doesn't support denormals (not an issue
198  // with Intel CPUs, but might be a problem on GPUs or PS3 SPUs),
199  // you will always get flush-to-zero behavior. This is bad,
200  // unless you're on a CPU where you don't care.
201  // - Denormals tend to be slow. FP32 denormals are rare in
202  // practice outside of things like recursive filters in DSP -
203  // not a typical half-float application. Whether FP16 denormals
204  // are rare in practice, I don't know. Whatever slow path your
205  // HW may or may not have for denormals, this may well hit it.
206  float fscale = floatbits(fint & round_mask) * floatbits(magic);
207  fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u));
208  int32_t fint2 = intbits(fscale) - round_mask;
209 
210  if (fint < f32infty)
211  o = fint2 >> 13; // Take the bits!
212 
213  return (o | (sign >> 16));
214 }
215 
216 float decode_fp16 (uint16_t h) {
217 
218  // https://gist.github.com/2144712
219  // Fabian "ryg" Giesen.
220 
221  const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift
222 
223  int32_t o = ((int32_t)(h & 0x7fffu)) << 13; // exponent/mantissa bits
224  int32_t exp = shifted_exp & o; // just the exponent
225  o += (int32_t)(127 - 15) << 23; // exponent adjust
226 
227  int32_t infnan_val = o + ((int32_t)(128 - 16) << 23);
228  int32_t zerodenorm_val = intbits(
229  floatbits(o + (1u<<23)) - floatbits(113u << 23));
230  int32_t reg_val = (exp == 0) ? zerodenorm_val : o;
231 
232  int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16;
233  return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
234 }
235 
236 #endif
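/* Quick sanity check of the fp16 codec (values approximate):

     float back = decode_fp16 (encode_fp16 (0.1f));  // ≈ 0.0999756, i.e. ~2.4e-5 error
     float same = decode_fp16 (encode_fp16 (1.5f));  // exactly 1.5, representable in fp16

   Magnitudes above ~65504 (the fp16 maximum) encode to +/- infinity. */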
237 
238 
239 
240 /*******************************************************************
241  * Quantizer: normalizes scalar vector components, then passes them
242  * through a codec
243  */
244 
245 
246 
247 struct Quantizer {
248  // encodes one vector. Assumes code is filled with 0s on input!
249  virtual void encode_vector(const float *x, uint8_t *code) const = 0;
250  virtual void decode_vector(const uint8_t *code, float *x) const = 0;
251 
252  virtual ~Quantizer() {}
253 };
254 
255 
256 template<class Codec, bool uniform, int SIMD>
257 struct QuantizerTemplate {};
258 
259 
260 template<class Codec>
261 struct QuantizerTemplate<Codec, true, 1>: Quantizer {
262  const size_t d;
263  const float vmin, vdiff;
264 
265  QuantizerTemplate(size_t d, const std::vector<float> &trained):
266  d(d), vmin(trained[0]), vdiff(trained[1])
267  {
268  }
269 
270  void encode_vector(const float* x, uint8_t* code) const override {
271  for (size_t i = 0; i < d; i++) {
272  float xi = (x[i] - vmin) / vdiff;
273  if (xi < 0)
274  xi = 0;
275  if (xi > 1.0)
276  xi = 1.0;
277  Codec::encode_component(xi, code, i);
278  }
279  }
280 
281  void decode_vector(const uint8_t* code, float* x) const override {
282  for (size_t i = 0; i < d; i++) {
283  float xi = Codec::decode_component(code, i);
284  x[i] = vmin + xi * vdiff;
285  }
286  }
287 
288  float reconstruct_component (const uint8_t * code, int i) const
289  {
290  float xi = Codec::decode_component (code, i);
291  return vmin + xi * vdiff;
292  }
293 
294 };
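/* Usage sketch for the uniform 8-bit case (hypothetical trained values):

     std::vector<float> trained = {-1.0f, 2.0f};          // {vmin, vdiff}: range [-1, 1]
     QuantizerTemplate<Codec8bit, true, 1> q (4, trained);
     float x[4] = {-1.0f, -0.5f, 0.0f, 1.0f};
     uint8_t code[4] = {0, 0, 0, 0};                       // encode_vector expects zeroed codes
     q.encode_vector (x, code);
     float y[4];
     q.decode_vector (code, y);   // y[i] matches x[i] to within ~half a bin (vdiff / 255)
*/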
295 
296 
297 
298 #ifdef USE_AVX
299 
300 template<class Codec>
301 struct QuantizerTemplate<Codec, true, 8>: QuantizerTemplate<Codec, true, 1> {
302 
303  QuantizerTemplate (size_t d, const std::vector<float> &trained):
304  QuantizerTemplate<Codec, true, 1> (d, trained) {}
305 
306  __m256 reconstruct_8_components (const uint8_t * code, int i) const
307  {
308  __m256 xi = Codec::decode_8_components (code, i);
309  return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
310  }
311 
312 };
313 
314 #endif
315 
316 
317 
318 template<class Codec>
319 struct QuantizerTemplate<Codec, false, 1>: Quantizer {
320  const size_t d;
321  const float *vmin, *vdiff;
322 
323  QuantizerTemplate (size_t d, const std::vector<float> &trained):
324  d(d), vmin(trained.data()), vdiff(trained.data() + d) {}
325 
326  void encode_vector(const float* x, uint8_t* code) const override {
327  for (size_t i = 0; i < d; i++) {
328  float xi = (x[i] - vmin[i]) / vdiff[i];
329  if (xi < 0)
330  xi = 0;
331  if (xi > 1.0)
332  xi = 1.0;
333  Codec::encode_component(xi, code, i);
334  }
335  }
336 
337  void decode_vector(const uint8_t* code, float* x) const override {
338  for (size_t i = 0; i < d; i++) {
339  float xi = Codec::decode_component(code, i);
340  x[i] = vmin[i] + xi * vdiff[i];
341  }
342  }
343 
344  float reconstruct_component (const uint8_t * code, int i) const
345  {
346  float xi = Codec::decode_component (code, i);
347  return vmin[i] + xi * vdiff[i];
348  }
349 
350 };
351 
352 
353 #ifdef USE_AVX
354 
355 template<class Codec>
356 struct QuantizerTemplate<Codec, false, 8>: QuantizerTemplate<Codec, false, 1> {
357 
358  QuantizerTemplate (size_t d, const std::vector<float> &trained):
359  QuantizerTemplate<Codec, false, 1> (d, trained) {}
360 
361  __m256 reconstruct_8_components (const uint8_t * code, int i) const
362  {
363  __m256 xi = Codec::decode_8_components (code, i);
364  return _mm256_loadu_ps (this->vmin + i) + xi * _mm256_loadu_ps (this->vdiff + i);
365  }
366 
367 
368 };
369 
370 #endif
371 
372 template<int SIMDWIDTH>
373 struct QuantizerFP16 {};
374 
375 
376 template<>
377 struct QuantizerFP16<1>: Quantizer {
378  const size_t d;
379 
380  QuantizerFP16(size_t d, const std::vector<float> & /* unused */):
381  d(d) {}
382 
383  void encode_vector(const float* x, uint8_t* code) const override {
384  for (size_t i = 0; i < d; i++) {
385  ((uint16_t*)code)[i] = encode_fp16(x[i]);
386  }
387  }
388 
389  void decode_vector(const uint8_t* code, float* x) const override {
390  for (size_t i = 0; i < d; i++) {
391  x[i] = decode_fp16(((uint16_t*)code)[i]);
392  }
393 
394  }
395 
396  float reconstruct_component (const uint8_t * code, int i) const
397  {
398  return decode_fp16(((uint16_t*)code)[i]);
399  }
400 
401 };
402 
403 #ifdef USE_AVX
404 
405 template<>
406 struct QuantizerFP16<8>: QuantizerFP16<1> {
407 
408  QuantizerFP16 (size_t d, const std::vector<float> &trained):
409  QuantizerFP16<1> (d, trained) {}
410 
411  __m256 reconstruct_8_components (const uint8_t * code, int i) const
412  {
413  __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
414  return _mm256_cvtph_ps (codei);
415  }
416 
417 };
418 
419 #endif
420 
421 
422 template<int SIMDWIDTH>
423 Quantizer *select_quantizer (
424  QuantizerType qtype,
425  size_t d, const std::vector<float> & trained)
426 {
427  switch(qtype) {
428  case ScalarQuantizer::QT_8bit:
429  return new QuantizerTemplate<Codec8bit, false, SIMDWIDTH>(d, trained);
430  case ScalarQuantizer::QT_4bit:
431  return new QuantizerTemplate<Codec4bit, false, SIMDWIDTH>(d, trained);
432  case ScalarQuantizer::QT_8bit_uniform:
433  return new QuantizerTemplate<Codec8bit, true, SIMDWIDTH>(d, trained);
434  case ScalarQuantizer::QT_4bit_uniform:
435  return new QuantizerTemplate<Codec4bit, true, SIMDWIDTH>(d, trained);
436  case ScalarQuantizer::QT_fp16:
437  return new QuantizerFP16<SIMDWIDTH> (d, trained);
438  }
439  FAISS_THROW_MSG ("unknown qtype");
440 }
441 
442 
443 
444 Quantizer *select_quantizer (const ScalarQuantizer &sq)
445 {
446 #ifdef USE_AVX
447  if (sq.d % 8 == 0) {
448  return select_quantizer<8> (sq.qtype, sq.d, sq.trained);
449  } else
450 #endif
451  {
452  return select_quantizer<1> (sq.qtype, sq.d, sq.trained);
453  }
454 }
455 
456 
457 
458 
459 /*******************************************************************
460  * Quantizer range training
461  */
462 
463 static float sqr (float x) {
464  return x * x;
465 }
466 
467 
468 void train_Uniform(RangeStat rs, float rs_arg,
469  idx_t n, int k, const float *x,
470  std::vector<float> & trained)
471 {
472  trained.resize (2);
473  float & vmin = trained[0];
474  float & vmax = trained[1];
475 
476  if (rs == ScalarQuantizer::RS_minmax) {
477  vmin = HUGE_VAL; vmax = -HUGE_VAL;
478  for (size_t i = 0; i < n; i++) {
479  if (x[i] < vmin) vmin = x[i];
480  if (x[i] > vmax) vmax = x[i];
481  }
482  float vexp = (vmax - vmin) * rs_arg;
483  vmin -= vexp;
484  vmax += vexp;
485  } else if (rs == ScalarQuantizer::RS_meanstd) {
486  double sum = 0, sum2 = 0;
487  for (size_t i = 0; i < n; i++) {
488  sum += x[i];
489  sum2 += x[i] * x[i];
490  }
491  float mean = sum / n;
492  float var = sum2 / n - mean * mean;
493  float std = var <= 0 ? 1.0 : sqrt(var);
494 
495  vmin = mean - std * rs_arg ;
496  vmax = mean + std * rs_arg ;
497  } else if (rs == ScalarQuantizer::RS_quantiles) {
498  std::vector<float> x_copy(n);
499  memcpy(x_copy.data(), x, n * sizeof(*x));
500  // TODO just do a quickselect
501  std::sort(x_copy.begin(), x_copy.end());
502  int o = int(rs_arg * n);
503  if (o < 0) o = 0;
504  if (o > n - o) o = n / 2;
505  vmin = x_copy[o];
506  vmax = x_copy[n - 1 - o];
507 
508  } else if (rs == ScalarQuantizer::RS_optim) {
509  float a, b;
510  float sx = 0;
511  {
512  vmin = HUGE_VAL, vmax = -HUGE_VAL;
513  for (size_t i = 0; i < n; i++) {
514  if (x[i] < vmin) vmin = x[i];
515  if (x[i] > vmax) vmax = x[i];
516  sx += x[i];
517  }
518  b = vmin;
519  a = (vmax - vmin) / (k - 1);
520  }
521  int verbose = false;
522  int niter = 2000;
523  float last_err = -1;
524  int iter_last_err = 0;
525  for (int it = 0; it < niter; it++) {
526  float sn = 0, sn2 = 0, sxn = 0, err1 = 0;
527 
528  for (idx_t i = 0; i < n; i++) {
529  float xi = x[i];
530  float ni = floor ((xi - b) / a + 0.5);
531  if (ni < 0) ni = 0;
532  if (ni >= k) ni = k - 1;
533  err1 += sqr (xi - (ni * a + b));
534  sn += ni;
535  sn2 += ni * ni;
536  sxn += ni * xi;
537  }
538 
539  if (err1 == last_err) {
540  iter_last_err ++;
541  if (iter_last_err == 16) break;
542  } else {
543  last_err = err1;
544  iter_last_err = 0;
545  }
546 
547  float det = sqr (sn) - sn2 * n;
548 
549  b = (sn * sxn - sn2 * sx) / det;
550  a = (sn * sx - n * sxn) / det;
551  if (verbose) {
552  printf ("it %d, err1=%g \r", it, err1);
553  fflush(stdout);
554  }
555  }
556  if (verbose) printf("\n");
557 
558  vmin = b;
559  vmax = b + a * (k - 1);
560 
561  } else {
562  FAISS_THROW_MSG ("Invalid qtype");
563  }
564  vmax -= vmin;
565 }
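/* Worked example (hypothetical data): with RS_minmax, rs_arg = 0 and samples
   spanning [-3, 5], the loop finds vmin = -3, vmax = 5, and the final
   "vmax -= vmin" leaves trained = {vmin, vdiff} = {-3, 8}, the layout the
   uniform QuantizerTemplate expects. With rs_arg = 0.1 the range is first
   widened by 10% on each side: trained = {-3.8, 9.6}. */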
566 
567 void train_NonUniform(RangeStat rs, float rs_arg,
568  idx_t n, int d, int k, const float *x,
569  std::vector<float> & trained)
570 {
571 
572  trained.resize (2 * d);
573  float * vmin = trained.data();
574  float * vmax = trained.data() + d;
575  if (rs == ScalarQuantizer::RS_minmax) {
576  memcpy (vmin, x, sizeof(*x) * d);
577  memcpy (vmax, x, sizeof(*x) * d);
578  for (size_t i = 1; i < n; i++) {
579  const float *xi = x + i * d;
580  for (size_t j = 0; j < d; j++) {
581  if (xi[j] < vmin[j]) vmin[j] = xi[j];
582  if (xi[j] > vmax[j]) vmax[j] = xi[j];
583  }
584  }
585  float *vdiff = vmax;
586  for (size_t j = 0; j < d; j++) {
587  float vexp = (vmax[j] - vmin[j]) * rs_arg;
588  vmin[j] -= vexp;
589  vmax[j] += vexp;
590  vdiff [j] = vmax[j] - vmin[j];
591  }
592  } else {
593  // transpose
594  std::vector<float> xt(n * d);
595  for (size_t i = 0; i < n; i++) {
596  const float *xi = x + i * d;
597  for (size_t j = 0; j < d; j++) {
598  xt[j * n + i] = xi[j];
599  }
600  }
601  std::vector<float> trained_d(2);
602 #pragma omp parallel for
603  for (size_t j = 0; j < d; j++) {
604  train_Uniform(rs, rs_arg,
605  n, k, xt.data() + j * n,
606  trained_d);
607  vmin[j] = trained_d[0];
608  vmax[j] = trained_d[1];
609  }
610  }
611 }
612 
613 
614 
615 /*******************************************************************
616  * Similarity: gets vector components and computes a similarity wrt. a
617  * query vector stored in the object. The data fields just encapsulate
618  * an accumulator.
619  */
620 
621 template<int SIMDWIDTH>
622 struct SimilarityL2 {};
623 
624 
625 template<>
626 struct SimilarityL2<1> {
627  const float *y, *yi;
628 
629  explicit SimilarityL2 (const float * y): y(y) {}
630 
631  /******* scalar accumulator *******/
632 
633  float accu;
634 
635  void begin () {
636  accu = 0;
637  yi = y;
638  }
639 
640  void add_component (float x) {
641  float tmp = *yi++ - x;
642  accu += tmp * tmp;
643  }
644 
645  void add_component_2 (float x1, float x2) {
646  float tmp = x1 - x2;
647  accu += tmp * tmp;
648  }
649 
650  float result () {
651  return accu;
652  }
653 };
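/* The accumulator protocol, as the DistanceComputer below uses it
   (sketch; d, query and decoded are placeholders):

     SimilarityL2<1> sim (query);
     sim.begin ();
     for (size_t i = 0; i < d; i++)
         sim.add_component (decoded[i]);   // accumulates (query[i] - decoded[i])^2
     float l2_sqr = sim.result ();         // squared L2 distance, no sqrt
*/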
654 
655 
656 #ifdef USE_AVX
657 template<>
658 struct SimilarityL2<8> {
659 
660  const float *y, *yi;
661 
662  explicit SimilarityL2 (const float * y): y(y) {}
663  __m256 accu8;
664 
665  void begin_8 () {
666  accu8 = _mm256_setzero_ps();
667  yi = y;
668  }
669 
670  void add_8_components (__m256 x) {
671  __m256 yiv = _mm256_loadu_ps (yi);
672  yi += 8;
673  __m256 tmp = yiv - x;
674  accu8 += tmp * tmp;
675  }
676 
677  void add_8_components_2 (__m256 x, __m256 y) {
678  __m256 tmp = y - x;
679  accu8 += tmp * tmp;
680  }
681 
682  float result_8 () {
683  __m256 sum = _mm256_hadd_ps(accu8, accu8);
684  __m256 sum2 = _mm256_hadd_ps(sum, sum);
685  // now add the 0th and 4th component
686  return
687  _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
688  _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
689  }
690 
691 };
692 
693 #endif
694 
695 
696 template<int SIMDWIDTH>
697 struct SimilarityIP {};
698 
699 
700 template<>
701 struct SimilarityIP<1> {
702  const float *y, *yi;
703 
704  float accu;
705 
706  explicit SimilarityIP (const float * y):
707  y (y) {}
708 
709  void begin () {
710  accu = 0;
711  yi = y;
712  }
713 
714  void add_component (float x) {
715  accu += *yi++ * x;
716  }
717 
718  void add_component_2 (float x1, float x2) {
719  accu += x1 * x2;
720  }
721 
722  float result () {
723  return accu;
724  }
725 };
726 
727 #ifdef USE_AVX
728 
729 template<>
730 struct SimilarityIP<8> {
731  const float *y, *yi;
732 
733  float accu;
734 
735  explicit SimilarityIP (const float * y):
736  y (y) {}
737 
738  __m256 accu8;
739 
740  void begin_8 () {
741  accu8 = _mm256_setzero_ps();
742  yi = y;
743  }
744 
745  void add_8_components (__m256 x) {
746  __m256 yiv = _mm256_loadu_ps (yi);
747  yi += 8;
748  accu8 += yiv * x;
749  }
750 
751  void add_8_components_2 (__m256 x1, __m256 x2) {
752  accu8 += x1 * x2;
753  }
754 
755  float result_8 () {
756  __m256 sum = _mm256_hadd_ps(accu8, accu8);
757  __m256 sum2 = _mm256_hadd_ps(sum, sum);
758  // now add the 0th and 4th component
759  return
760  _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
761  _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
762  }
763 };
764 #endif
765 
766 
767 /*******************************************************************
768  * DistanceComputer: combines a similarity and a quantizer to do
769  * code-to-vector or code-to-code comparisons
770  */
771 
772 template<class Quantizer, class Similarity, int SIMDWIDTH>
773 struct DCTemplate : ScalarQuantizer::DistanceComputer {};
774 
775 
776 template<class Quantizer, class Similarity>
777 struct DCTemplate<Quantizer, Similarity, 1> : DistanceComputer
778 {
779 
780  Quantizer quant;
781 
782  DCTemplate(size_t d, const std::vector<float> &trained):
783  quant(d, trained)
784  {}
785 
786  float compute_distance (const float *x,
787  const uint8_t *code) const override final
788  {
789  Similarity sim(x);
790  sim.begin();
791  for (size_t i = 0; i < quant.d; i ++) {
792  float xi = quant.reconstruct_component (code, i);
793  sim.add_component (xi);
794  }
795  return sim.result();
796  }
797 
798  float compute_code_distance (const uint8_t *code1,
799  const uint8_t *code2) const override final
800  {
801  Similarity sim(nullptr);
802  sim.begin ();
803  for (size_t i = 0; i < quant.d; i ++) {
804  float x1 = quant.reconstruct_component (code1, i);
805  float x2 = quant.reconstruct_component (code2, i);
806  sim.add_component_2 (x1, x2);
807  }
808  return sim.result ();
809  }
810 
811 };
812 
813 #ifdef USE_AVX
814 
815 template<class Quantizer, class Similarity>
816 struct DCTemplate<Quantizer, Similarity, 8> : DistanceComputer
817 {
818 
819  Quantizer quant;
820 
821  DCTemplate(size_t d, const std::vector<float> &trained):
822  quant(d, trained)
823  {}
824 
825  float compute_distance (const float *x,
826  const uint8_t *code) const override final
827  {
828  Similarity sim(x);
829  sim.begin_8();
830  for (size_t i = 0; i < quant.d; i += 8) {
831  __m256 xi = quant.reconstruct_8_components (code, i);
832  sim.add_8_components (xi);
833  }
834  return sim.result_8();
835  }
836 
837  float compute_code_distance (const uint8_t *code1,
838  const uint8_t *code2) const override final
839  {
840  Similarity sim(nullptr);
841  sim.begin_8 ();
842  for (size_t i = 0; i < quant.d; i += 8) {
843  __m256 x1 = quant.reconstruct_8_components (code1, i);
844  __m256 x2 = quant.reconstruct_8_components (code2, i);
845  sim.add_8_components_2 (x1, x2);
846  }
847  return sim.result_8 ();
848  }
849 
850 };
851 
852 
853 #endif
854 
855 
856 
857 template<class Sim, int SIMDWIDTH>
858 DistanceComputer *select_distance_computer (
859  QuantizerType qtype,
860  size_t d, const std::vector<float> & trained)
861 {
862  switch(qtype) {
863  case ScalarQuantizer::QT_8bit_uniform:
864  return new DCTemplate<QuantizerTemplate<Codec8bit, true, SIMDWIDTH>,
865  Sim, SIMDWIDTH>(d, trained);
866 
867  case ScalarQuantizer::QT_4bit_uniform:
868  return new DCTemplate<QuantizerTemplate<Codec4bit, true, SIMDWIDTH>,
869  Sim, SIMDWIDTH>(d, trained);
870 
871  case ScalarQuantizer::QT_8bit:
872  return new DCTemplate<QuantizerTemplate<Codec8bit, false, SIMDWIDTH>,
873  Sim, SIMDWIDTH>(d, trained);
874 
875  case ScalarQuantizer::QT_4bit:
876  return new DCTemplate<QuantizerTemplate<Codec4bit, false, SIMDWIDTH>,
877  Sim, SIMDWIDTH>(d, trained);
878 
879  case ScalarQuantizer::QT_fp16:
880  return new DCTemplate<QuantizerFP16<SIMDWIDTH>,
881  Sim, SIMDWIDTH>(d, trained);
882  }
883  FAISS_THROW_MSG ("unknown qtype");
884  return nullptr;
885 }
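/* Putting the codec, quantizer and similarity together (sketch with
   placeholder variables):

     DistanceComputer *dc = select_distance_computer<SimilarityL2<1>, 1> (
         ScalarQuantizer::QT_8bit, d, trained);
     float d1 = dc->compute_distance (query, code);          // query vs encoded vector
     float d2 = dc->compute_code_distance (code_a, code_b);  // encoded vs encoded
     delete dc;

   ScalarQuantizer::get_distance_computer below makes the same choice of
   similarity and SIMD width based on the metric and the dimension. */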
886 
887 
888 
889 } // anonymous namespace
890 
891 
892 
893 /*******************************************************************
894  * ScalarQuantizer implementation
895  ********************************************************************/
896 
897 ScalarQuantizer::ScalarQuantizer
898  (size_t d, QuantizerType qtype):
899  qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
900 {
901  switch (qtype) {
902  case QT_8bit: case QT_8bit_uniform:
903  code_size = d;
904  break;
905  case QT_4bit: case QT_4bit_uniform:
906  code_size = (d + 1) / 2;
907  break;
908  case QT_fp16:
909  code_size = d * 2;
910  break;
911  }
912 
913 }
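/* Resulting code_size per vector, e.g. for a hypothetical d = 65:
     QT_8bit / QT_8bit_uniform:  65 bytes  (one byte per component)
     QT_4bit / QT_4bit_uniform:  (65 + 1) / 2 = 33 bytes  (two components per byte)
     QT_fp16:                    65 * 2 = 130 bytes */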
914 
915 ScalarQuantizer::ScalarQuantizer ():
916  qtype(QT_8bit),
917  rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
918 {}
919 
920 void ScalarQuantizer::train (size_t n, const float *x)
921 {
922  int bit_per_dim =
923  qtype == QT_4bit_uniform ? 4 :
924  qtype == QT_4bit ? 4 :
925  qtype == QT_8bit_uniform ? 8 :
926  qtype == QT_8bit ? 8 : -1;
927 
928  switch (qtype) {
929  case QT_4bit_uniform: case QT_8bit_uniform:
930  train_Uniform (rangestat, rangestat_arg,
931  n * d, 1 << bit_per_dim, x, trained);
932  break;
933  case QT_4bit: case QT_8bit:
934  train_NonUniform (rangestat, rangestat_arg,
935  n, d, 1 << bit_per_dim, x, trained);
936  break;
937  case QT_fp16:
938  // no training necessary
939  break;
940  }
941 }
942 
943 void ScalarQuantizer::compute_codes (const float * x,
944  uint8_t * codes,
945  size_t n) const
946 {
947  Quantizer *squant = select_quantizer (*this);
948  ScopeDeleter1<Quantizer> del(squant);
949  memset (codes, 0, code_size * n);
950 #pragma omp parallel for
951  for (size_t i = 0; i < n; i++)
952  squant->encode_vector (x + i * d, codes + i * code_size);
953 }
954 
955 void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
956 {
957  Quantizer *squant = select_quantizer (*this);
958  ScopeDeleter1<Quantizer> del(squant);
959 #pragma omp parallel for
960  for (size_t i = 0; i < n; i++)
961  squant->decode_vector (codes + i * code_size, x + i * d);
962 }
963 
964 
965 ScalarQuantizer::DistanceComputer *ScalarQuantizer::get_distance_computer (
966  MetricType metric)
967  const
968 {
969 #ifdef USE_AVX
970  if (d % 8 == 0) {
971  if (metric == METRIC_L2) {
972  return select_distance_computer<SimilarityL2<8>, 8>
973  (qtype, d, trained);
974  } else {
975  return select_distance_computer<SimilarityIP<8>, 8>
976  (qtype, d, trained);
977  }
978  } else
979 #endif
980  {
981  if (metric == METRIC_L2) {
982  return select_distance_computer<SimilarityL2<1>, 1>
983  (qtype, d, trained);
984  } else {
985  return select_distance_computer<SimilarityIP<1>, 1>
986  (qtype, d, trained);
987  }
988  }
989 }
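/* End-to-end use of ScalarQuantizer on its own (sketch; n, d, xb and xq are
   placeholders for user-supplied data):

     ScalarQuantizer sq (d, ScalarQuantizer::QT_8bit);
     sq.train (n, xb);                                    // learns per-dimension ranges
     std::vector<uint8_t> codes (n * sq.code_size);
     sq.compute_codes (xb, codes.data(), n);
     ScalarQuantizer::DistanceComputer *dc = sq.get_distance_computer (METRIC_L2);
     float dis = dc->compute_distance (xq, codes.data()); // distance to the first code
     delete dc;
*/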
990 
991 
992 /*******************************************************************
993  * IndexScalarQuantizer implementation
994  ********************************************************************/
995 
996 IndexScalarQuantizer::IndexScalarQuantizer
997  (int d, ScalarQuantizer::QuantizerType qtype,
998  MetricType metric):
999  Index(d, metric),
1000  sq (d, qtype)
1001 {
1002  is_trained = false;
1003  code_size = sq.code_size;
1004 }
1005 
1006 
1007 IndexScalarQuantizer::IndexScalarQuantizer ():
1008  IndexScalarQuantizer (0, ScalarQuantizer::QT_8bit)
1009 {}
1010 
1011 void IndexScalarQuantizer::train(idx_t n, const float* x)
1012 {
1013  sq.train(n, x);
1014  is_trained = true;
1015 }
1016 
1017 void IndexScalarQuantizer::add(idx_t n, const float* x)
1018 {
1019  FAISS_THROW_IF_NOT (is_trained);
1020  codes.resize ((n + ntotal) * code_size);
1021  sq.compute_codes (x, &codes[ntotal * code_size], n);
1022  ntotal += n;
1023 }
1024 
1025 
1026 
1027 namespace {
1028 
1029 template<class C>
1030 void search_flat_scalar_quantizer(
1031  const IndexScalarQuantizer & index,
1032  idx_t n,
1033  const float* x,
1034  idx_t k,
1035  float* distances,
1036  idx_t* labels)
1037 {
1038  size_t code_size = index.code_size;
1039  size_t d = index.d;
1040 
1041 #pragma omp parallel
1042  {
1043  DistanceComputer *dc =
1044  index.sq.get_distance_computer(index.metric_type);
1045  ScopeDeleter1<DistanceComputer> del(dc);
1046 
1047 #pragma omp for
1048  for (size_t i = 0; i < n; i++) {
1049  idx_t *idxi = labels + i * k;
1050  float *simi = distances + i * k;
1051  heap_heapify<C> (k, simi, idxi);
1052 
1053  const float *xi = x + i * d;
1054  const uint8_t *ci = index.codes.data ();
1055 
1056  for (size_t j = 0; j < index.ntotal; j++) {
1057  float accu = dc->compute_distance(xi, ci);
1058  if (C::cmp (simi [0], accu)) {
1059  heap_pop<C> (k, simi, idxi);
1060  heap_push<C> (k, simi, idxi, accu, j);
1061  }
1062  ci += code_size;
1063  }
1064  heap_reorder<C> (k, simi, idxi);
1065  }
1066  }
1067 
1068 };
1069 
1070 }
1071 
1072 void IndexScalarQuantizer::search (
1073  idx_t n,
1074  const float* x,
1075  idx_t k,
1076  float* distances,
1077  idx_t* labels) const
1078 {
1079  FAISS_THROW_IF_NOT (is_trained);
1080  if (metric_type == METRIC_L2) {
1081  search_flat_scalar_quantizer<CMax<float, idx_t> > (*this, n, x, k, distances, labels);
1082  } else {
1083  search_flat_scalar_quantizer<CMin<float, idx_t> > (*this, n, x, k, distances, labels);
1084  }
1085 }
1086 
1087 void IndexScalarQuantizer::reset()
1088 {
1089  codes.clear();
1090  ntotal = 0;
1091 }
1092 
1093 void IndexScalarQuantizer::reconstruct_n(
1094  idx_t i0, idx_t ni, float* recons) const
1095 {
1096  Quantizer *squant = select_quantizer (sq);
1097  ScopeDeleter1<Quantizer> del (squant);
1098  for (size_t i = 0; i < ni; i++) {
1099  squant->decode_vector(&codes[(i + i0) * code_size], recons + i * d);
1100  }
1101 }
1102 
1103 void IndexScalarQuantizer::reconstruct(idx_t key, float* recons) const
1104 {
1105  reconstruct_n(key, 1, recons);
1106 }
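/* Typical flat usage (sketch; nb, nq, d, k, xb and xq are placeholders):

     IndexScalarQuantizer index (d, ScalarQuantizer::QT_8bit, METRIC_L2);
     index.train (nb, xb);                 // estimates the quantization ranges
     index.add (nb, xb);                   // stores nb * code_size bytes of codes
     std::vector<float> D (nq * k);
     std::vector<Index::idx_t> I (nq * k);
     index.search (nq, xq, k, D.data(), I.data());
*/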
1107 
1108 
1109 /*******************************************************************
1110  * IndexIVFScalarQuantizer implementation
1111  ********************************************************************/
1112 
1113 IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
1114  (Index *quantizer, size_t d, size_t nlist,
1115  QuantizerType qtype, MetricType metric):
1116  IndexIVF (quantizer, d, nlist, 0, metric),
1117  sq (d, qtype)
1118 {
1119  code_size = sq.code_size;
1120  // was not known at construction time
1121  invlists->code_size = code_size;
1122  is_trained = false;
1123 }
1124 
1125 IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
1126  IndexIVF ()
1127 {}
1128 
1129 void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
1130 {
1131  const float * x_in = x;
1132 
1133  // 100k points more than enough
1134  x = fvecs_maybe_subsample (
1135  d, (size_t*)&n, 100000,
1136  x, verbose, 1234);
1137 
1138  ScopeDeleter<float> del_x (x_in == x ? nullptr : x);
1139 
1140  long * idx = new long [n];
1141  ScopeDeleter<long> del (idx);
1142  quantizer->assign (n, x, idx);
1143  float *residuals = new float [n * d];
1144  ScopeDeleter<float> del2 (residuals);
1145 
1146 #pragma omp parallel for
1147  for (idx_t i = 0; i < n; i++) {
1148  quantizer->compute_residual (x + i * d, residuals + i * d, idx[i]);
1149  }
1150 
1151  sq.train (n, residuals);
1152 
1153 }
1154 
1155 void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
1156  const idx_t *list_nos,
1157  uint8_t * codes) const
1158 {
1159  Quantizer *squant = select_quantizer (sq);
1160  ScopeDeleter1<Quantizer> del (squant);
1161  memset(codes, 0, code_size * n);
1162 
1163 #pragma omp parallel
1164  {
1165  std::vector<float> residual (d);
1166 
1167  // each thread takes care of a subset of lists
1168 #pragma omp for
1169  for (size_t i = 0; i < n; i++) {
1170  long list_no = list_nos [i];
1171  if (list_no >= 0) {
1172 
1173  quantizer->compute_residual (
1174  x + i * d, residual.data(), list_no);
1175 
1176  squant->encode_vector (residual.data(),
1177  codes + i * code_size);
1178  } else {
1179  memset (codes + i * code_size, 0, code_size);
1180  }
1181  }
1182  }
1183 }
1184 
1185 
1186 
1187 void IndexIVFScalarQuantizer::add_with_ids
1188  (idx_t n, const float * x, const long *xids)
1189 {
1190  FAISS_THROW_IF_NOT (is_trained);
1191  long * idx = new long [n];
1192  ScopeDeleter<long> del (idx);
1193  quantizer->assign (n, x, idx);
1194  size_t nadd = 0;
1195  Quantizer *squant = select_quantizer (sq);
1196  ScopeDeleter1<Quantizer> del2 (squant);
1197 
1198 #pragma omp parallel reduction(+: nadd)
1199  {
1200  std::vector<float> residual (d);
1201  std::vector<uint8_t> one_code (code_size);
1202  int nt = omp_get_num_threads();
1203  int rank = omp_get_thread_num();
1204 
1205  // each thread takes care of a subset of lists
1206  for (size_t i = 0; i < n; i++) {
1207  long list_no = idx [i];
1208  if (list_no >= 0 && list_no % nt == rank) {
1209  long id = xids ? xids[i] : ntotal + i;
1210 
1211  quantizer->compute_residual (
1212  x + i * d, residual.data(), list_no);
1213 
1214  memset (one_code.data(), 0, code_size);
1215  squant->encode_vector (residual.data(), one_code.data());
1216 
1217  invlists->add_entry (list_no, id, one_code.data());
1218 
1219  nadd++;
1220 
1221  }
1222  }
1223  }
1224  ntotal += n;
1225 }
1226 
1227 namespace {
1228 
1229 
1230 template<bool store_pairs, class Quantizer, int SIMDWIDTH>
1231 struct IVFSQScannerIP: InvertedListScanner {
1232 
1233  DCTemplate<Quantizer, SimilarityIP<SIMDWIDTH>, SIMDWIDTH> dc;
1234 
1235  size_t code_size;
1236 
1237  IVFSQScannerIP(int d, const std::vector<float> & trained,
1238  size_t code_size): dc(d, trained), code_size(code_size)
1239  {}
1240 
1241 
1242  const float *x;
1243  void set_query (const float *query) override {
1244  this->x = query;
1245  }
1246 
1247  idx_t list_no;
1248  float accu0;
1249 
1250  void set_list (idx_t list_no, float coarse_dis) override {
1251  this->list_no = list_no;
1252  accu0 = coarse_dis;
1253  }
1254 
1255  float distance_to_code (const uint8_t *code) const override {
1256  return accu0 + dc.compute_distance(x, code);
1257  }
1258 
1259  size_t scan_codes (size_t list_size,
1260  const uint8_t *codes,
1261  const idx_t *ids,
1262  float *simi, idx_t *idxi,
1263  size_t k) const override
1264  {
1265  size_t nup = 0;
1266 
1267  for (size_t j = 0; j < list_size; j++) {
1268 
1269  float accu = accu0 + dc.compute_distance(x, codes);
1270 
1271  if (accu > simi [0]) {
1272  minheap_pop (k, simi, idxi);
1273  long id = store_pairs ? (list_no << 32 | j) : ids[j];
1274  minheap_push (k, simi, idxi, accu, id);
1275  nup++;
1276  }
1277  codes += code_size;
1278  }
1279  return nup;
1280  }
1281 
1282 };
1283 
1284 
1285 template<bool store_pairs, class Quantizer, int SIMDWIDTH>
1286 struct IVFSQScannerL2: InvertedListScanner {
1287 
1288  DCTemplate<Quantizer, SimilarityL2<SIMDWIDTH>, SIMDWIDTH> dc;
1289 
1290  size_t code_size;
1291  const Index *quantizer;
1292 
1293  std::vector<float> tmp;
1294 
1295  IVFSQScannerL2(int d, const std::vector<float> & trained,
1296  size_t code_size,
1297  const Index *quantizer):
1298  dc(d, trained), code_size(code_size), quantizer(quantizer),
1299  tmp (d)
1300  {
1301  }
1302 
1303  const float *x;
1304  void set_query (const float *query) override {
1305  x = query;
1306  }
1307 
1308  idx_t list_no;
1309 
1310  void set_list (idx_t list_no, float coarse_dis) override {
1311  this->list_no = list_no;
1312  // shift of x_in wrt centroid
1313  quantizer->compute_residual (x, tmp.data(), list_no);
1314  }
1315 
1316  float distance_to_code (const uint8_t *code) const override {
1317  return dc.compute_distance (tmp.data(), code);
1318  }
1319 
1320  size_t scan_codes (size_t list_size,
1321  const uint8_t *codes,
1322  const idx_t *ids,
1323  float *simi, idx_t *idxi,
1324  size_t k) const override
1325  {
1326  size_t nup = 0;
1327  for (size_t j = 0; j < list_size; j++) {
1328 
1329  float dis = dc.compute_distance (tmp.data(), codes);
1330 
1331  if (dis < simi [0]) {
1332  maxheap_pop (k, simi, idxi);
1333  long id = store_pairs ? (list_no << 32 | j) : ids[j];
1334  maxheap_push (k, simi, idxi, dis, id);
1335  nup++;
1336  }
1337  codes += code_size;
1338  }
1339  return nup;
1340  }
1341 
1342 };
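/* The two scanners differ only in how the coarse quantizer enters the
   distance: for inner product, the query-to-centroid score passed to set_list
   is simply added to the code-to-query score (accu0 + ...), while for L2 the
   query is first replaced by its residual wrt the list centroid, which is then
   compared directly against the stored codes (themselves residuals, see
   encode_vectors above). */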
1343 
1344 template<class Quantizer, int SIMDWIDTH>
1345 InvertedListScanner* sel2_InvertedListScanner
1346  (const IndexIVFScalarQuantizer *ivf, bool store_pairs)
1347 {
1348  if (ivf->metric_type == METRIC_L2) {
1349  if (store_pairs) {
1350  return new IVFSQScannerL2<true, Quantizer, SIMDWIDTH>
1351  (ivf->d, ivf->sq.trained, ivf->code_size, ivf->quantizer);
1352  } else {
1353  return new IVFSQScannerL2<false, Quantizer, SIMDWIDTH>
1354  (ivf->d, ivf->sq.trained, ivf->code_size, ivf->quantizer);
1355  }
1356  } else {
1357  if (store_pairs) {
1358  return new IVFSQScannerIP<true, Quantizer, SIMDWIDTH>
1359  (ivf->d, ivf->sq.trained, ivf->code_size);
1360  } else {
1361  return new IVFSQScannerIP<false, Quantizer, SIMDWIDTH>
1362  (ivf->d, ivf->sq.trained, ivf->code_size);
1363  }
1364  }
1365 }
1366 
1367 
1368 template<int SIMDWIDTH>
1369 InvertedListScanner* select_InvertedListScanner
1370  (const IndexIVFScalarQuantizer *ivf, bool store_pairs)
1371 {
1372  switch(ivf->sq.qtype) {
1373  case ScalarQuantizer::QT_8bit_uniform:
1374  return sel2_InvertedListScanner
1375  <QuantizerTemplate<Codec8bit, true, SIMDWIDTH>,
1376  SIMDWIDTH>(ivf, store_pairs);
1377  case ScalarQuantizer::QT_4bit_uniform:
1378  return sel2_InvertedListScanner
1379  <QuantizerTemplate<Codec4bit, true, SIMDWIDTH>,
1380  SIMDWIDTH>(ivf, store_pairs);
1381  case ScalarQuantizer::QT_8bit:
1382  return sel2_InvertedListScanner
1383  <QuantizerTemplate<Codec8bit, false, SIMDWIDTH>,
1384  SIMDWIDTH>(ivf, store_pairs);
1385  case ScalarQuantizer::QT_4bit:
1386  return sel2_InvertedListScanner
1387  <QuantizerTemplate<Codec4bit, false, SIMDWIDTH>,
1388  SIMDWIDTH>(ivf, store_pairs);
1389  case ScalarQuantizer::QT_fp16:
1390  return sel2_InvertedListScanner<QuantizerFP16<SIMDWIDTH>,
1391  SIMDWIDTH>(ivf, store_pairs);
1392  }
1393  FAISS_THROW_MSG ("unknown qtype");
1394  return nullptr;
1395 }
1396 
1397 
1398 } // anonymous namespace
1399 
1400 
1401 
1402 
1403 InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner
1404  (bool store_pairs) const
1405 {
1406 #ifdef USE_AVX
1407  if (d % 8 == 0) {
1408  return select_InvertedListScanner<8> (this, store_pairs);
1409  } else
1410 #endif
1411  {
1412  return select_InvertedListScanner<1> (this, store_pairs);
1413  }
1414 
1415 }
1416 
1417 
1418 void IndexIVFScalarQuantizer::reconstruct_from_offset (long list_no,
1419  long offset,
1420  float* recons) const
1421 {
1422  std::vector<float> centroid(d);
1423  quantizer->reconstruct (list_no, centroid.data());
1424 
1425  const uint8_t* code = invlists->get_single_code (list_no, offset);
1426  sq.decode (code, recons, 1);
1427  for (int i = 0; i < d; ++i) {
1428  recons[i] += centroid[i];
1429  }
1430 }
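/* Typical IVF usage (sketch; nb, nq, d, k, nlist, xb and xq are placeholders;
   IndexFlatL2 comes from IndexFlat.h):

     IndexFlatL2 coarse (d);                            // coarse quantizer
     IndexIVFScalarQuantizer index (&coarse, d, nlist,
                                    ScalarQuantizer::QT_8bit, METRIC_L2);
     index.train (nb, xb);     // clusters the coarse quantizer, trains residual ranges
     index.add (nb, xb);
     index.nprobe = 8;         // inverted lists visited per query
     std::vector<float> D (nq * k);
     std::vector<Index::idx_t> I (nq * k);
     index.search (nq, xq, k, D.data(), I.data());
*/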
1431 
1432 } // namespace faiss