IndexScalarQuantizer.cpp
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

// -*- c++ -*-

#include "IndexScalarQuantizer.h"

#include <cstdio>
#include <cstring>
#include <cmath>
#include <algorithm>

#include <omp.h>

#ifdef __SSE__
#include <immintrin.h>
#endif

#include "utils.h"
#include "FaissAssert.h"
#include "AuxIndexStructures.h"

namespace faiss {

/*******************************************************************
 * ScalarQuantizer implementation
 *
 * The main source of complexity is to support combinations of 4
 * variants without incurring runtime tests or virtual function calls:
 *
 * - 4 / 8 bits per code component
 * - uniform / non-uniform
 * - IP / L2 distance search
 * - scalar / AVX distance computation
 *
 * The appropriate Quantizer object is returned via select_quantizer,
 * which hides the template mess.
 ********************************************************************/

#ifdef __AVX__
#define USE_AVX
#endif

struct SQDistanceComputer: DistanceComputer {

    const float *q;
    const uint8_t *codes;
    size_t code_size;

    SQDistanceComputer (): q(nullptr), codes (nullptr), code_size (0)
    {}

};


namespace {

typedef Index::idx_t idx_t;
typedef ScalarQuantizer::QuantizerType QuantizerType;
typedef ScalarQuantizer::RangeStat RangeStat;



/*******************************************************************
 * Codec: converts between values in [0, 1] and an index in a code
 * array. The "i" parameter is the vector component index (not byte
 * index).
 */

struct Codec8bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code[i] = (int)(255 * x);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (code[i] + 0.5f) / 255.0f;
    }

#ifdef USE_AVX
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint64_t c8 = *(uint64_t*)(code + i);
        __m128i c4lo = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8));
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_set1_epi32(c8 >> 32));
        // __m256i i8 = _mm256_set_m128i(c4lo, c4hi);
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_255 = _mm256_set1_ps (1.f / 255.f);
        return f8 * one_255;
    }
#endif
};
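
// Illustration (added for exposition; not part of the original file): the
// Codec8bit contract in isolation. Encoding maps a normalized component in
// [0, 1] to one of 256 buckets; decoding returns the bucket center, so a
// round trip is accurate to within half a bucket (1/510).
inline void demo_codec8bit_roundtrip ()
{
    uint8_t code[1] = {0};
    Codec8bit::encode_component (0.4f, code, 0);     // code[0] = (int)(255 * 0.4) = 102
    float x = Codec8bit::decode_component (code, 0); // (102 + 0.5) / 255 ~= 0.402
    (void)x;
}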


struct Codec4bit {

    static void encode_component (float x, uint8_t *code, int i) {
        code [i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static float decode_component (const uint8_t *code, int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }


#ifdef USE_AVX
    static __m256 decode_8_components (const uint8_t *code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;
        uint32_t c4od = (c4 >> 4) & mask;

        // the 8 lower bytes of c8 contain the values
        __m128i c8 = _mm_unpacklo_epi8 (_mm_set1_epi32(c4ev),
                                        _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32 (c8);
        __m128i c4hi = _mm_cvtepu8_epi32 (_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256 (c4lo);
        i8 = _mm256_insertf128_si256 (i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps (i8);
        __m256 half = _mm256_set1_ps (0.5f);
        f8 += half;
        __m256 one_15 = _mm256_set1_ps (1.f / 15.f);
        return f8 * one_15;
    }
#endif
};

struct Codec6bit {

    static void encode_component (float x, uint8_t *code, int i) {
        int bits = (int)(x * 63.0);
        code += (i >> 2) * 3;
        switch(i & 3) {
        case 0:
            code[0] |= bits;
            break;
        case 1:
            code[0] |= bits << 6;
            code[1] |= bits >> 2;
            break;
        case 2:
            code[1] |= bits << 4;
            code[2] |= bits >> 4;
            break;
        case 3:
            code[2] |= bits << 2;
            break;
        }
    }

    static float decode_component (const uint8_t *code, int i) {
        uint8_t bits;
        code += (i >> 2) * 3;
        switch(i & 3) {
        case 0:
            bits = code[0] & 0x3f;
            break;
        case 1:
            bits = code[0] >> 6;
            bits |= (code[1] & 0xf) << 2;
            break;
        case 2:
            bits = code[1] >> 4;
            bits |= (code[2] & 3) << 4;
            break;
        case 3:
            bits = code[2] >> 2;
            break;
        }
        return (bits + 0.5f) / 63.0f;
    }

#ifdef USE_AVX
    static __m256 decode_8_components (const uint8_t *code, int i) {
        return _mm256_set_ps
            (decode_component(code, i + 7),
             decode_component(code, i + 6),
             decode_component(code, i + 5),
             decode_component(code, i + 4),
             decode_component(code, i + 3),
             decode_component(code, i + 2),
             decode_component(code, i + 1),
             decode_component(code, i + 0));
    }
#endif
};
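
// Layout note (added for exposition; not part of the original file): Codec6bit
// packs each group of 4 components c0..c3 into 3 bytes, filling bytes from the
// low bits up:
//
//   byte 0:  c1[1:0] | c0[5:0]
//   byte 1:  c2[3:0] | c1[5:2]
//   byte 2:  c3[5:0] | c2[5:4]
//
// hence the switch on (i & 3) above and the 3-byte stride per group of 4.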



#ifdef USE_AVX


uint16_t encode_fp16 (float x) {
    __m128 xf = _mm_set1_ps (x);
    __m128i xi = _mm_cvtps_ph (
        xf, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
    return _mm_cvtsi128_si32 (xi) & 0xffff;
}


float decode_fp16 (uint16_t x) {
    __m128i xi = _mm_set1_epi16 (x);
    __m128 xf = _mm_cvtph_ps (xi);
    return _mm_cvtss_f32 (xf);
}

#else

// non-intrinsic FP16 <-> FP32 code adapted from
// https://github.com/ispc/ispc/blob/master/stdlib.ispc

float floatbits (uint32_t x) {
    void *xptr = &x;
    return *(float*)xptr;
}

uint32_t intbits (float f) {
    void *fptr = &f;
    return *(uint32_t*)fptr;
}


uint16_t encode_fp16 (float f) {

    // via Fabian "ryg" Giesen.
    // https://gist.github.com/2156668
    uint32_t sign_mask = 0x80000000u;
    int32_t o;

    uint32_t fint = intbits(f);
    uint32_t sign = fint & sign_mask;
    fint ^= sign;

    // NOTE all the integer compares in this function can be safely
    // compiled into signed compares since all operands are below
    // 0x80000000. Important if you want fast straight SSE2 code (since
    // there's no unsigned PCMPGTD).

    // Inf or NaN (all exponent bits set)
    // NaN->qNaN and Inf->Inf
    // unconditional assignment here, will override with right value for
    // the regular case below.
    uint32_t f32infty = 255u << 23;
    o = (fint > f32infty) ? 0x7e00u : 0x7c00u;

    // (De)normalized number or zero
    // update fint unconditionally to save the blending; we don't need it
    // anymore for the Inf/NaN case anyway.

    const uint32_t round_mask = ~0xfffu;
    const uint32_t magic = 15u << 23;

    // Shift exponent down, denormalize if necessary.
    // NOTE This represents half-float denormals using single
    // precision denormals. The main reason to do this is that
    // there's no shift with per-lane variable shifts in SSE*, which
    // we'd otherwise need. It has some funky side effects though:
    // - This conversion will actually respect the FTZ (Flush To Zero)
    //   flag in MXCSR - if it's set, no half-float denormals will be
    //   generated. I'm honestly not sure whether this is good or
    //   bad. It's definitely interesting.
    // - If the underlying HW doesn't support denormals (not an issue
    //   with Intel CPUs, but might be a problem on GPUs or PS3 SPUs),
    //   you will always get flush-to-zero behavior. This is bad,
    //   unless you're on a CPU where you don't care.
    // - Denormals tend to be slow. FP32 denormals are rare in
    //   practice outside of things like recursive filters in DSP -
    //   not a typical half-float application. Whether FP16 denormals
    //   are rare in practice, I don't know. Whatever slow path your
    //   HW may or may not have for denormals, this may well hit it.
    float fscale = floatbits(fint & round_mask) * floatbits(magic);
    fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u));
    int32_t fint2 = intbits(fscale) - round_mask;

    if (fint < f32infty)
        o = fint2 >> 13; // Take the bits!

    return (o | (sign >> 16));
}

float decode_fp16 (uint16_t h) {

    // https://gist.github.com/2144712
    // Fabian "ryg" Giesen.

    const uint32_t shifted_exp = 0x7c00u << 13;  // exponent mask after shift

    int32_t o = ((int32_t)(h & 0x7fffu)) << 13;  // exponent/mantissa bits
    int32_t exp = shifted_exp & o;               // just the exponent
    o += (int32_t)(127 - 15) << 23;              // exponent adjust

    int32_t infnan_val = o + ((int32_t)(128 - 16) << 23);
    int32_t zerodenorm_val = intbits(
        floatbits(o + (1u<<23)) - floatbits(113u << 23));
    int32_t reg_val = (exp == 0) ? zerodenorm_val : o;

    int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16;
    return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit);
}

#endif
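
// Illustration (added for exposition; not part of the original file): both
// implementations above produce IEEE 754 binary16, so a round trip keeps
// about 3 decimal digits of precision.
inline void demo_fp16_roundtrip ()
{
    uint16_t h = encode_fp16 (0.1f);
    float x = decode_fp16 (h);   // x = 0.0999755859375, nearest fp16 to 0.1
    (void)x;
}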



/*******************************************************************
 * Quantizer: normalizes scalar vector components, then passes them
 * through a codec
 *******************************************************************/



struct Quantizer {
    // encodes one vector. Assumes code is filled with 0s on input!
    virtual void encode_vector(const float *x, uint8_t *code) const = 0;
    virtual void decode_vector(const uint8_t *code, float *x) const = 0;

    virtual ~Quantizer() {}
};


template<class Codec, bool uniform, int SIMD>
struct QuantizerTemplate {};


template<class Codec>
struct QuantizerTemplate<Codec, true, 1>: Quantizer {
    const size_t d;
    const float vmin, vdiff;

    QuantizerTemplate(size_t d, const std::vector<float> &trained):
        d(d), vmin(trained[0]), vdiff(trained[1])
    {
    }

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin) / vdiff;
            if (xi < 0) {
                xi = 0;
            }
            if (xi > 1.0) {
                xi = 1.0;
            }
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin + xi * vdiff;
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin + xi * vdiff;
    }

};
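
// Illustration (added for exposition; not part of the original file): the
// uniform quantizer maps each component through (x - vmin) / vdiff, clips to
// [0, 1] and hands the result to the codec. With vmin = 0 and vdiff = 2, the
// component 0.5 encodes to (int)(255 * 0.25) = 63 and decodes back to
// ((63 + 0.5) / 255) * 2 ~= 0.498.
inline void demo_uniform_quantizer ()
{
    std::vector<float> trained = {0.0f, 2.0f};   // {vmin, vdiff}
    QuantizerTemplate<Codec8bit, true, 1> quant (4, trained);
    float x[4] = {0.5f, 1.0f, 1.5f, 2.0f};
    uint8_t code[4] = {0, 0, 0, 0};   // must be zero-filled before encoding
    quant.encode_vector (x, code);
    float x2[4];
    quant.decode_vector (code, x2);   // recovers x to within vdiff / 255
}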



#ifdef USE_AVX

template<class Codec>
struct QuantizerTemplate<Codec, true, 8>: QuantizerTemplate<Codec, true, 1> {

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        QuantizerTemplate<Codec, true, 1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_set1_ps(this->vmin) + xi * _mm256_set1_ps (this->vdiff);
    }

};

#endif



template<class Codec>
struct QuantizerTemplate<Codec, false, 1>: Quantizer {
    const size_t d;
    const float *vmin, *vdiff;

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        d(d), vmin(trained.data()), vdiff(trained.data() + d) {}

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = (x[i] - vmin[i]) / vdiff[i];
            if (xi < 0)
                xi = 0;
            if (xi > 1.0)
                xi = 1.0;
            Codec::encode_component(xi, code, i);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            float xi = Codec::decode_component(code, i);
            x[i] = vmin[i] + xi * vdiff[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        float xi = Codec::decode_component (code, i);
        return vmin[i] + xi * vdiff[i];
    }

};


#ifdef USE_AVX

template<class Codec>
struct QuantizerTemplate<Codec, false, 8>: QuantizerTemplate<Codec, false, 1> {

    QuantizerTemplate (size_t d, const std::vector<float> &trained):
        QuantizerTemplate<Codec, false, 1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m256 xi = Codec::decode_8_components (code, i);
        return _mm256_loadu_ps (this->vmin + i) +
            xi * _mm256_loadu_ps (this->vdiff + i);
    }


};

#endif

/*******************************************************************
 * FP16 quantizer
 *******************************************************************/

template<int SIMDWIDTH>
struct QuantizerFP16 {};

template<>
struct QuantizerFP16<1>: Quantizer {
    const size_t d;

    QuantizerFP16(size_t d, const std::vector<float> & /* unused */):
        d(d) {}

    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            ((uint16_t*)code)[i] = encode_fp16(x[i]);
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            x[i] = decode_fp16(((uint16_t*)code)[i]);
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        return decode_fp16(((uint16_t*)code)[i]);
    }

};

#ifdef USE_AVX

template<>
struct QuantizerFP16<8>: QuantizerFP16<1> {

    QuantizerFP16 (size_t d, const std::vector<float> &trained):
        QuantizerFP16<1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i codei = _mm_loadu_si128 ((const __m128i*)(code + 2 * i));
        return _mm256_cvtph_ps (codei);
    }

};

#endif

/*******************************************************************
 * 8bit_direct quantizer
 *******************************************************************/

template<int SIMDWIDTH>
struct Quantizer8bitDirect {};

template<>
struct Quantizer8bitDirect<1>: Quantizer {
    const size_t d;

    Quantizer8bitDirect(size_t d, const std::vector<float> & /* unused */):
        d(d) {}


    void encode_vector(const float* x, uint8_t* code) const final {
        for (size_t i = 0; i < d; i++) {
            code[i] = (uint8_t)x[i];
        }
    }

    void decode_vector(const uint8_t* code, float* x) const final {
        for (size_t i = 0; i < d; i++) {
            x[i] = code[i];
        }
    }

    float reconstruct_component (const uint8_t * code, int i) const
    {
        return code[i];
    }

};

#ifdef USE_AVX

template<>
struct Quantizer8bitDirect<8>: Quantizer8bitDirect<1> {

    Quantizer8bitDirect (size_t d, const std::vector<float> &trained):
        Quantizer8bitDirect<1> (d, trained) {}

    __m256 reconstruct_8_components (const uint8_t * code, int i) const
    {
        __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i));  // 8 * int8
        __m256i y8 = _mm256_cvtepu8_epi32 (x8);              // 8 * int32
        return _mm256_cvtepi32_ps (y8);                      // 8 * float32
    }

};

#endif


template<int SIMDWIDTH>
Quantizer *select_quantizer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    switch(qtype) {
    case ScalarQuantizer::QT_8bit:
        return new QuantizerTemplate<Codec8bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_6bit:
        return new QuantizerTemplate<Codec6bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_4bit:
        return new QuantizerTemplate<Codec4bit, false, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_8bit_uniform:
        return new QuantizerTemplate<Codec8bit, true, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_4bit_uniform:
        return new QuantizerTemplate<Codec4bit, true, SIMDWIDTH>(d, trained);
    case ScalarQuantizer::QT_fp16:
        return new QuantizerFP16<SIMDWIDTH> (d, trained);
    case ScalarQuantizer::QT_8bit_direct:
        return new Quantizer8bitDirect<SIMDWIDTH> (d, trained);
    }
    FAISS_THROW_MSG ("unknown qtype");
}



Quantizer *select_quantizer (const ScalarQuantizer &sq)
{
#ifdef USE_AVX
    if (sq.d % 8 == 0) {
        return select_quantizer<8> (sq.qtype, sq.d, sq.trained);
    } else
#endif
    {
        return select_quantizer<1> (sq.qtype, sq.d, sq.trained);
    }
}
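
// Illustration (added for exposition; not part of the original file): how the
// rest of this file uses select_quantizer. The caller owns the returned
// object, and output codes must be zero-filled before encode_vector is
// called (see the Quantizer comment above):
//
//     Quantizer *squant = select_quantizer (sq);
//     ScopeDeleter1<Quantizer> del (squant);
//     memset (code, 0, sq.code_size);
//     squant->encode_vector (x, code);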




/*******************************************************************
 * Quantizer range training
 */

static float sqr (float x) {
    return x * x;
}


void train_Uniform(RangeStat rs, float rs_arg,
                   idx_t n, int k, const float *x,
                   std::vector<float> & trained)
{
    trained.resize (2);
    float & vmin = trained[0];
    float & vmax = trained[1];

    if (rs == ScalarQuantizer::RS_minmax) {
        vmin = HUGE_VAL; vmax = -HUGE_VAL;
        for (size_t i = 0; i < n; i++) {
            if (x[i] < vmin) vmin = x[i];
            if (x[i] > vmax) vmax = x[i];
        }
        float vexp = (vmax - vmin) * rs_arg;
        vmin -= vexp;
        vmax += vexp;
    } else if (rs == ScalarQuantizer::RS_meanstd) {
        double sum = 0, sum2 = 0;
        for (size_t i = 0; i < n; i++) {
            sum += x[i];
            sum2 += x[i] * x[i];
        }
        float mean = sum / n;
        float var = sum2 / n - mean * mean;
        float std = var <= 0 ? 1.0 : sqrt(var);

        vmin = mean - std * rs_arg;
        vmax = mean + std * rs_arg;
    } else if (rs == ScalarQuantizer::RS_quantiles) {
        std::vector<float> x_copy(n);
        memcpy(x_copy.data(), x, n * sizeof(*x));
        // TODO just do a quickselect
        std::sort(x_copy.begin(), x_copy.end());
        int o = int(rs_arg * n);
        if (o < 0) o = 0;
        if (o > n - o) o = n / 2;
        vmin = x_copy[o];
        vmax = x_copy[n - 1 - o];

    } else if (rs == ScalarQuantizer::RS_optim) {
        float a, b;
        float sx = 0;
        {
            vmin = HUGE_VAL, vmax = -HUGE_VAL;
            for (size_t i = 0; i < n; i++) {
                if (x[i] < vmin) vmin = x[i];
                if (x[i] > vmax) vmax = x[i];
                sx += x[i];
            }
            b = vmin;
            a = (vmax - vmin) / (k - 1);
        }
        int verbose = false;
        int niter = 2000;
        float last_err = -1;
        int iter_last_err = 0;
        for (int it = 0; it < niter; it++) {
            float sn = 0, sn2 = 0, sxn = 0, err1 = 0;

            for (idx_t i = 0; i < n; i++) {
                float xi = x[i];
                float ni = floor ((xi - b) / a + 0.5);
                if (ni < 0) ni = 0;
                if (ni >= k) ni = k - 1;
                err1 += sqr (xi - (ni * a + b));
                sn  += ni;
                sn2 += ni * ni;
                sxn += ni * xi;
            }

            if (err1 == last_err) {
                iter_last_err ++;
                if (iter_last_err == 16) break;
            } else {
                last_err = err1;
                iter_last_err = 0;
            }

            float det = sqr (sn) - sn2 * n;

            b = (sn * sxn - sn2 * sx) / det;
            a = (sn * sx - n * sxn) / det;
            if (verbose) {
                printf ("it %d, err1=%g \r", it, err1);
                fflush(stdout);
            }
        }
        if (verbose) printf("\n");

        vmin = b;
        vmax = b + a * (k - 1);

    } else {
        FAISS_THROW_MSG ("unknown range stat");
    }
    vmax -= vmin;
}
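
// Note (added for exposition; not part of the original file): RS_optim
// alternates two least-squares steps, like 1-D k-means on an affine grid of
// k levels. Given (a, b), each x_i is assigned to the nearest level
// n_i = round((x_i - b) / a); given the n_i, the loop above solves the
// normal equations of
//
//     min over (a, b) of  sum_i (x_i - (a * n_i + b))^2
//
// by Cramer's rule from the sums sn = sum n_i, sn2 = sum n_i^2, sx = sum x_i
// and sxn = sum n_i * x_i (the det / a / b updates). Also note that every
// range stat returns trained = {vmin, vdiff} rather than {vmin, vmax}: the
// final "vmax -= vmin" converts the upper bound into the range width.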

void train_NonUniform(RangeStat rs, float rs_arg,
                      idx_t n, int d, int k, const float *x,
                      std::vector<float> & trained)
{

    trained.resize (2 * d);
    float * vmin = trained.data();
    float * vmax = trained.data() + d;
    if (rs == ScalarQuantizer::RS_minmax) {
        memcpy (vmin, x, sizeof(*x) * d);
        memcpy (vmax, x, sizeof(*x) * d);
        for (size_t i = 1; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                if (xi[j] < vmin[j]) vmin[j] = xi[j];
                if (xi[j] > vmax[j]) vmax[j] = xi[j];
            }
        }
        float *vdiff = vmax;
        for (size_t j = 0; j < d; j++) {
            float vexp = (vmax[j] - vmin[j]) * rs_arg;
            vmin[j] -= vexp;
            vmax[j] += vexp;
            vdiff [j] = vmax[j] - vmin[j];
        }
    } else {
        // transpose, so that each dimension is contiguous
        std::vector<float> xt(n * d);
        for (size_t i = 0; i < n; i++) {
            const float *xi = x + i * d;
            for (size_t j = 0; j < d; j++) {
                xt[j * n + i] = xi[j];
            }
        }
#pragma omp parallel for
        for (size_t j = 0; j < d; j++) {
            // per-thread output buffer (sharing one across threads would race)
            std::vector<float> trained_d(2);
            train_Uniform(rs, rs_arg,
                          n, k, xt.data() + j * n,
                          trained_d);
            vmin[j] = trained_d[0];
            vmax[j] = trained_d[1];
        }
    }
}
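
// Note (added for exposition; not part of the original file): the trained
// vector produced here is laid out as d values of vmin followed by d values
// of vdiff, which is exactly what QuantizerTemplate<Codec, false, ...> reads
// back via trained.data() and trained.data() + d.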



/*******************************************************************
 * Similarity: gets vector components and computes a similarity wrt. a
 * query vector stored in the object. The data fields just encapsulate
 * an accumulator.
 */

template<int SIMDWIDTH>
struct SimilarityL2 {};


template<>
struct SimilarityL2<1> {
    static constexpr int simdwidth = 1;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}

    /******* scalar accumulator *******/

    float accu;

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        float tmp = *yi++ - x;
        accu += tmp * tmp;
    }

    void add_component_2 (float x1, float x2) {
        float tmp = x1 - x2;
        accu += tmp * tmp;
    }

    float result () {
        return accu;
    }
};
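
// Illustration (added for exposition; not part of the original file): the
// accumulator protocol that the distance computers below rely on. begin()
// resets the accumulator, add_component() consumes one reconstructed
// component against the stored query, result() returns the final distance.
inline float demo_similarity_l2 (const float *query, const float *yvec,
                                 size_t d)
{
    SimilarityL2<1> sim (query);
    sim.begin ();
    for (size_t i = 0; i < d; i++) {
        sim.add_component (yvec[i]);   // accumulates (query[i] - yvec[i])^2
    }
    return sim.result ();
}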


#ifdef USE_AVX
template<>
struct SimilarityL2<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_L2;

    const float *y, *yi;

    explicit SimilarityL2 (const float * y): y(y) {}
    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        __m256 tmp = yiv - x;
        accu8 += tmp * tmp;
    }

    void add_8_components_2 (__m256 x, __m256 y) {
        __m256 tmp = y - x;
        accu8 += tmp * tmp;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }

};

#endif


template<int SIMDWIDTH>
struct SimilarityIP {};


template<>
struct SimilarityIP<1> {
    static constexpr int simdwidth = 1;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;
    const float *y, *yi;

    float accu;

    explicit SimilarityIP (const float * y):
        y (y) {}

    void begin () {
        accu = 0;
        yi = y;
    }

    void add_component (float x) {
        accu += *yi++ * x;
    }

    void add_component_2 (float x1, float x2) {
        accu += x1 * x2;
    }

    float result () {
        return accu;
    }
};

#ifdef USE_AVX

template<>
struct SimilarityIP<8> {
    static constexpr int simdwidth = 8;
    static constexpr MetricType metric_type = METRIC_INNER_PRODUCT;

    const float *y, *yi;

    float accu;

    explicit SimilarityIP (const float * y):
        y (y) {}

    __m256 accu8;

    void begin_8 () {
        accu8 = _mm256_setzero_ps();
        yi = y;
    }

    void add_8_components (__m256 x) {
        __m256 yiv = _mm256_loadu_ps (yi);
        yi += 8;
        accu8 += yiv * x;
    }

    void add_8_components_2 (__m256 x1, __m256 x2) {
        accu8 += x1 * x2;
    }

    float result_8 () {
        __m256 sum = _mm256_hadd_ps(accu8, accu8);
        __m256 sum2 = _mm256_hadd_ps(sum, sum);
        // now add the 0th and 4th component
        return
            _mm_cvtss_f32 (_mm256_castps256_ps128(sum2)) +
            _mm_cvtss_f32 (_mm256_extractf128_ps(sum2, 1));
    }
};
#endif


/*******************************************************************
 * DistanceComputer: combines a similarity and a quantizer to do
 * code-to-vector or code-to-code comparisons
 *******************************************************************/

template<class Quantizer, class Similarity, int SIMDWIDTH>
struct DCTemplate : SQDistanceComputer {};

template<class Quantizer, class Similarity>
struct DCTemplate<Quantizer, Similarity, 1> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;

    DCTemplate(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance(const float* x, const uint8_t* code) const {

        Similarity sim(x);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float xi = quant.reconstruct_component(code, i);
            sim.add_component(xi);
        }
        return sim.result();
    }

    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);
        sim.begin();
        for (size_t i = 0; i < quant.d; i++) {
            float x1 = quant.reconstruct_component(code1, i);
            float x2 = quant.reconstruct_component(code2, i);
            sim.add_component_2(x1, x2);
        }
        return sim.result();
    }

    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }

};

#ifdef USE_AVX

template<class Quantizer, class Similarity>
struct DCTemplate<Quantizer, Similarity, 8> : SQDistanceComputer
{
    using Sim = Similarity;

    Quantizer quant;

    DCTemplate(size_t d, const std::vector<float> &trained):
        quant(d, trained)
    {}

    float compute_distance(const float* x, const uint8_t* code) const {

        Similarity sim(x);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 xi = quant.reconstruct_8_components(code, i);
            sim.add_8_components(xi);
        }
        return sim.result_8();
    }

    float compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        Similarity sim(nullptr);
        sim.begin_8();
        for (size_t i = 0; i < quant.d; i += 8) {
            __m256 x1 = quant.reconstruct_8_components(code1, i);
            __m256 x2 = quant.reconstruct_8_components(code2, i);
            sim.add_8_components_2(x1, x2);
        }
        return sim.result_8();
    }

    void set_query (const float *x) final {
        q = x;
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_distance (q, code);
    }

};

#endif



/*******************************************************************
 * DistanceComputerByte: computes distances in the integer domain
 *******************************************************************/

template<class Similarity, int SIMDWIDTH>
struct DistanceComputerByte : SQDistanceComputer {};

template<class Similarity>
struct DistanceComputerByte<Similarity, 1> : SQDistanceComputer {
    using Sim = Similarity;

    int d;
    std::vector<uint8_t> tmp;

    DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        int accu = 0;
        for (int i = 0; i < d; i++) {
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                accu += int(code1[i]) * code2[i];
            } else {
                int diff = int(code1[i]) - code2[i];
                accu += diff * diff;
            }
        }
        return accu;
    }

    void set_query (const float *x) final {
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }

};

#ifdef USE_AVX


template<class Similarity>
struct DistanceComputerByte<Similarity, 8> : SQDistanceComputer {
    using Sim = Similarity;

    int d;
    std::vector<uint8_t> tmp;

    DistanceComputerByte(int d, const std::vector<float> &): d(d), tmp(d) {
    }

    int compute_code_distance(const uint8_t* code1, const uint8_t* code2)
        const {
        // __m256i accu = _mm256_setzero_ps ();
        __m256i accu = _mm256_setzero_si256 ();
        for (int i = 0; i < d; i += 16) {
            // load 16 bytes, convert to 16 uint16_t
            __m256i c1 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code1 + i)));
            __m256i c2 = _mm256_cvtepu8_epi16
                (_mm_loadu_si128((__m128i*)(code2 + i)));
            __m256i prod32;
            if (Sim::metric_type == METRIC_INNER_PRODUCT) {
                prod32 = _mm256_madd_epi16(c1, c2);
            } else {
                __m256i diff = _mm256_sub_epi16(c1, c2);
                prod32 = _mm256_madd_epi16(diff, diff);
            }
            accu = _mm256_add_epi32 (accu, prod32);

        }
        __m128i sum = _mm256_extractf128_si256(accu, 0);
        sum = _mm_add_epi32 (sum, _mm256_extractf128_si256(accu, 1));
        sum = _mm_hadd_epi32 (sum, sum);
        sum = _mm_hadd_epi32 (sum, sum);
        return _mm_cvtsi128_si32 (sum);
    }

    void set_query (const float *x) final {
        /*
        for (int i = 0; i < d; i += 8) {
            __m256 xi = _mm256_loadu_ps (x + i);
            __m256i ci = _mm256_cvtps_epi32(xi);
        */
        for (int i = 0; i < d; i++) {
            tmp[i] = int(x[i]);
        }
    }

    int compute_distance(const float* x, const uint8_t* code) {
        set_query(x);
        return compute_code_distance(tmp.data(), code);
    }

    /// compute distance of vector i to current query
    float operator () (idx_t i) final {
        return compute_distance (q, codes + i * code_size);
    }

    float symmetric_dis (idx_t i, idx_t j) override {
        return compute_code_distance (codes + i * code_size,
                                      codes + j * code_size);
    }

    float query_to_code (const uint8_t * code) const {
        return compute_code_distance (tmp.data(), code);
    }


};

#endif

/*******************************************************************
 * select_distance_computer: runtime selection of template
 * specialization
 *******************************************************************/


template<class Sim>
SQDistanceComputer *select_distance_computer (
          QuantizerType qtype,
          size_t d, const std::vector<float> & trained)
{
    constexpr int SIMDWIDTH = Sim::simdwidth;
    switch(qtype) {
    case ScalarQuantizer::QT_8bit_uniform:
        return new DCTemplate<QuantizerTemplate<Codec8bit, true, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_4bit_uniform:
        return new DCTemplate<QuantizerTemplate<Codec4bit, true, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_8bit:
        return new DCTemplate<QuantizerTemplate<Codec8bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_6bit:
        return new DCTemplate<QuantizerTemplate<Codec6bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_4bit:
        return new DCTemplate<QuantizerTemplate<Codec4bit, false, SIMDWIDTH>,
                              Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_fp16:
        return new DCTemplate
            <QuantizerFP16<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);

    case ScalarQuantizer::QT_8bit_direct:
        if (d % 16 == 0) {
            return new DistanceComputerByte<Sim, SIMDWIDTH>(d, trained);
        } else {
            return new DCTemplate
                <Quantizer8bitDirect<SIMDWIDTH>, Sim, SIMDWIDTH>(d, trained);
        }
    }
    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}
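
// Illustration (added for exposition; not part of the original file): how a
// distance computer produced by select_distance_computer is wired up. The
// caller fills in the codes / code_size fields inherited from
// SQDistanceComputer, then queries by vector id (QT_fp16 ignores the trained
// vector, so it can stay empty here).
inline void demo_distance_computer (const uint8_t *codes, size_t code_size,
                                    const float *query)
{
    std::vector<float> trained;   // unused for QT_fp16
    SQDistanceComputer *dc = select_distance_computer<SimilarityL2<1> >
        (ScalarQuantizer::QT_fp16, 8, trained);
    ScopeDeleter1<SQDistanceComputer> del (dc);
    dc->codes = codes;
    dc->code_size = code_size;
    dc->set_query (query);
    float d0 = (*dc) (0);                  // query to stored vector 0
    float d01 = dc->symmetric_dis (0, 1);  // code-to-code distance
    (void)d0; (void)d01;
}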



} // anonymous namespace



/*******************************************************************
 * ScalarQuantizer implementation
 ********************************************************************/

ScalarQuantizer::ScalarQuantizer
          (size_t d, QuantizerType qtype):
          qtype (qtype), rangestat(RS_minmax), rangestat_arg(0), d (d)
{
    switch (qtype) {
    case QT_8bit:
    case QT_8bit_uniform:
    case QT_8bit_direct:
        code_size = d;
        break;
    case QT_4bit:
    case QT_4bit_uniform:
        code_size = (d + 1) / 2;
        break;
    case QT_6bit:
        code_size = (d * 6 + 7) / 8;
        break;
    case QT_fp16:
        code_size = d * 2;
        break;
    }

}

ScalarQuantizer::ScalarQuantizer ():
    qtype(QT_8bit),
    rangestat(RS_minmax), rangestat_arg(0), d (0), code_size(0)
{}

void ScalarQuantizer::train (size_t n, const float *x)
{
    int bit_per_dim =
        qtype == QT_4bit_uniform ? 4 :
        qtype == QT_4bit ? 4 :
        qtype == QT_6bit ? 6 :
        qtype == QT_8bit_uniform ? 8 :
        qtype == QT_8bit ? 8 : -1;

    switch (qtype) {
    case QT_4bit_uniform: case QT_8bit_uniform:
        train_Uniform (rangestat, rangestat_arg,
                       n * d, 1 << bit_per_dim, x, trained);
        break;
    case QT_4bit: case QT_8bit: case QT_6bit:
        train_NonUniform (rangestat, rangestat_arg,
                          n, d, 1 << bit_per_dim, x, trained);
        break;
    case QT_fp16:
    case QT_8bit_direct:
        // no training necessary
        break;
    }
}

void ScalarQuantizer::compute_codes (const float * x,
                                     uint8_t * codes,
                                     size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
    ScopeDeleter1<Quantizer> del(squant);
    memset (codes, 0, code_size * n);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->encode_vector (x + i * d, codes + i * code_size);
}

void ScalarQuantizer::decode (const uint8_t *codes, float *x, size_t n) const
{
    Quantizer *squant = select_quantizer (*this);
    ScopeDeleter1<Quantizer> del(squant);
#pragma omp parallel for
    for (size_t i = 0; i < n; i++)
        squant->decode_vector (codes + i * code_size, x + i * d);
}
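
// Illustration (added for exposition; not part of the original file):
// end-to-end use of ScalarQuantizer on its own: train the ranges, encode a
// batch of vectors, decode them back.
inline void demo_scalar_quantizer_roundtrip ()
{
    size_t d = 8, n = 100;
    std::vector<float> x (n * d);
    for (size_t i = 0; i < n * d; i++) {
        x[i] = (i % 97) / 97.0f;   // arbitrary data in [0, 1)
    }
    ScalarQuantizer sq (d, ScalarQuantizer::QT_8bit);
    sq.train (n, x.data());
    std::vector<uint8_t> codes (n * sq.code_size);
    sq.compute_codes (x.data(), codes.data(), n);
    std::vector<float> x2 (n * d);
    sq.decode (codes.data(), x2.data(), n);  // x2 ~= x, within ~1/255 of range
}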


SQDistanceComputer *
ScalarQuantizer::get_distance_computer (MetricType metric) const
{
#ifdef USE_AVX
    if (d % 8 == 0) {
        if (metric == METRIC_L2) {
            return select_distance_computer<SimilarityL2<8> >
                (qtype, d, trained);
        } else {
            return select_distance_computer<SimilarityIP<8> >
                (qtype, d, trained);
        }
    } else
#endif
    {
        if (metric == METRIC_L2) {
            return select_distance_computer<SimilarityL2<1> >
                (qtype, d, trained);
        } else {
            return select_distance_computer<SimilarityIP<1> >
                (qtype, d, trained);
        }
    }
}


/*******************************************************************
 * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object
 *
 * It is an InvertedListScanner, but is designed to work with
 * IndexScalarQuantizer as well.
 ********************************************************************/

namespace {


template<class DCClass>
struct IVFSQScannerIP: InvertedListScanner {
    DCClass dc;
    bool store_pairs, by_residual;

    size_t code_size;

    idx_t list_no;  /// current list (set to 0 for Flat index)
    float accu0;    /// added to all distances

    IVFSQScannerIP(int d, const std::vector<float> & trained,
                   size_t code_size, bool store_pairs,
                   bool by_residual):
        dc(d, trained), store_pairs(store_pairs),
        by_residual(by_residual),
        code_size(code_size), list_no(0), accu0(0)
    {}


    void set_query (const float *query) override {
        dc.set_query (query);
    }

    void set_list (idx_t list_no, float coarse_dis) override {
        this->list_no = list_no;
        accu0 = by_residual ? coarse_dis : 0;
    }

    float distance_to_code (const uint8_t *code) const final {
        return accu0 + dc.query_to_code (code);
    }

    size_t scan_codes (size_t list_size,
                       const uint8_t *codes,
                       const idx_t *ids,
                       float *simi, idx_t *idxi,
                       size_t k) const override
    {
        size_t nup = 0;

        for (size_t j = 0; j < list_size; j++) {

            float accu = accu0 + dc.query_to_code (codes);

            if (accu > simi [0]) {
                minheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                minheap_push (k, simi, idxi, accu, id);
                nup++;
            }
            codes += code_size;
        }
        return nup;
    }

    void scan_codes_range (size_t list_size,
                           const uint8_t *codes,
                           const idx_t *ids,
                           float radius,
                           RangeQueryResult & res) const override
    {
        for (size_t j = 0; j < list_size; j++) {
            float accu = accu0 + dc.query_to_code (codes);
            if (accu > radius) {
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                res.add (accu, id);
            }
            codes += code_size;
        }
    }


};


template<class DCClass>
struct IVFSQScannerL2: InvertedListScanner {

    DCClass dc;

    bool store_pairs, by_residual;
    size_t code_size;
    const Index *quantizer;
    idx_t list_no;    /// current inverted list
    const float *x;   /// current query

    std::vector<float> tmp;

    IVFSQScannerL2(int d, const std::vector<float> & trained,
                   size_t code_size, const Index *quantizer,
                   bool store_pairs, bool by_residual):
        dc(d, trained), store_pairs(store_pairs), by_residual(by_residual),
        code_size(code_size), quantizer(quantizer),
        list_no (0), x (nullptr), tmp (d)
    {
    }


    void set_query (const float *query) override {
        x = query;
        if (!quantizer) {
            dc.set_query (query);
        }
    }


    void set_list (idx_t list_no, float /*coarse_dis*/) override {
        if (by_residual) {
            this->list_no = list_no;
            // shift of x_in wrt centroid
            quantizer->compute_residual (x, tmp.data(), list_no);
            dc.set_query (tmp.data ());
        } else {
            dc.set_query (x);
        }
    }

    float distance_to_code (const uint8_t *code) const final {
        return dc.query_to_code (code);
    }

    size_t scan_codes (size_t list_size,
                       const uint8_t *codes,
                       const idx_t *ids,
                       float *simi, idx_t *idxi,
                       size_t k) const override
    {
        size_t nup = 0;
        for (size_t j = 0; j < list_size; j++) {

            float dis = dc.query_to_code (codes);

            if (dis < simi [0]) {
                maxheap_pop (k, simi, idxi);
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                maxheap_push (k, simi, idxi, dis, id);
                nup++;
            }
            codes += code_size;
        }
        return nup;
    }

    void scan_codes_range (size_t list_size,
                           const uint8_t *codes,
                           const idx_t *ids,
                           float radius,
                           RangeQueryResult & res) const override
    {
        for (size_t j = 0; j < list_size; j++) {
            float dis = dc.query_to_code (codes);
            if (dis < radius) {
                long id = store_pairs ? (list_no << 32 | j) : ids[j];
                res.add (dis, id);
            }
            codes += code_size;
        }
    }


};

template<class DCClass>
InvertedListScanner* sel2_InvertedListScanner
      (const ScalarQuantizer *sq,
       const Index *quantizer, bool store_pairs, bool r)
{
    if (DCClass::Sim::metric_type == METRIC_L2) {
        return new IVFSQScannerL2<DCClass>(sq->d, sq->trained, sq->code_size,
                                           quantizer, store_pairs, r);
    } else {
        return new IVFSQScannerIP<DCClass>(sq->d, sq->trained, sq->code_size,
                                           store_pairs, r);
    }
}

template<class Similarity, class Codec, bool uniform>
InvertedListScanner* sel12_InvertedListScanner
      (const ScalarQuantizer *sq,
       const Index *quantizer, bool store_pairs, bool r)
{
    constexpr int SIMDWIDTH = Similarity::simdwidth;
    using QuantizerClass = QuantizerTemplate<Codec, uniform, SIMDWIDTH>;
    using DCClass = DCTemplate<QuantizerClass, Similarity, SIMDWIDTH>;
    return sel2_InvertedListScanner<DCClass> (sq, quantizer, store_pairs, r);
}



template<class Similarity>
InvertedListScanner* sel1_InvertedListScanner
        (const ScalarQuantizer *sq, const Index *quantizer,
         bool store_pairs, bool r)
{
    constexpr int SIMDWIDTH = Similarity::simdwidth;
    switch(sq->qtype) {
    case ScalarQuantizer::QT_8bit_uniform:
        return sel12_InvertedListScanner
            <Similarity, Codec8bit, true>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_4bit_uniform:
        return sel12_InvertedListScanner
            <Similarity, Codec4bit, true>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_8bit:
        return sel12_InvertedListScanner
            <Similarity, Codec8bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_4bit:
        return sel12_InvertedListScanner
            <Similarity, Codec4bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_6bit:
        return sel12_InvertedListScanner
            <Similarity, Codec6bit, false>(sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_fp16:
        return sel2_InvertedListScanner
            <DCTemplate<QuantizerFP16<SIMDWIDTH>, Similarity, SIMDWIDTH> >
            (sq, quantizer, store_pairs, r);
    case ScalarQuantizer::QT_8bit_direct:
        if (sq->d % 16 == 0) {
            return sel2_InvertedListScanner
                <DistanceComputerByte<Similarity, SIMDWIDTH> >
                (sq, quantizer, store_pairs, r);
        } else {
            return sel2_InvertedListScanner
                <DCTemplate<Quantizer8bitDirect<SIMDWIDTH>,
                            Similarity, SIMDWIDTH> >
                (sq, quantizer, store_pairs, r);
        }

    }

    FAISS_THROW_MSG ("unknown qtype");
    return nullptr;
}

template<int SIMDWIDTH>
InvertedListScanner* sel0_InvertedListScanner
        (MetricType mt, const ScalarQuantizer *sq,
         const Index *quantizer, bool store_pairs, bool by_residual)
{
    if (mt == METRIC_L2) {
        return sel1_InvertedListScanner<SimilarityL2<SIMDWIDTH> >
            (sq, quantizer, store_pairs, by_residual);
    } else {
        return sel1_InvertedListScanner<SimilarityIP<SIMDWIDTH> >
            (sq, quantizer, store_pairs, by_residual);
    }
}


InvertedListScanner* select_InvertedListScanner
        (MetricType mt, const ScalarQuantizer *sq,
         const Index *quantizer, bool store_pairs, bool by_residual=false)
{
#ifdef USE_AVX
    if (sq->d % 8 == 0) {
        return sel0_InvertedListScanner<8>
            (mt, sq, quantizer, store_pairs, by_residual);
    } else
#endif
    {
        return sel0_InvertedListScanner<1>
            (mt, sq, quantizer, store_pairs, by_residual);
    }
}


} // anonymous namespace


/*******************************************************************
 * IndexScalarQuantizer implementation
 ********************************************************************/

IndexScalarQuantizer::IndexScalarQuantizer
          (int d, ScalarQuantizer::QuantizerType qtype,
           MetricType metric):
          Index(d, metric),
          sq (d, qtype)
{
    is_trained =
        qtype == ScalarQuantizer::QT_fp16 ||
        qtype == ScalarQuantizer::QT_8bit_direct;
    code_size = sq.code_size;
}


IndexScalarQuantizer::IndexScalarQuantizer ():
    IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit)
{}

void IndexScalarQuantizer::train(idx_t n, const float* x)
{
    sq.train(n, x);
    is_trained = true;
}

void IndexScalarQuantizer::add(idx_t n, const float* x)
{
    FAISS_THROW_IF_NOT (is_trained);
    codes.resize ((n + ntotal) * code_size);
    sq.compute_codes (x, &codes[ntotal * code_size], n);
    ntotal += n;
}


void IndexScalarQuantizer::search(
        idx_t n,
        const float* x,
        idx_t k,
        float* distances,
        idx_t* labels) const
{
    FAISS_THROW_IF_NOT (is_trained);

#pragma omp parallel
    {
        InvertedListScanner* scanner = select_InvertedListScanner
            (metric_type, &sq, nullptr, true);
        ScopeDeleter1<InvertedListScanner> del(scanner);

#pragma omp for
        for (size_t i = 0; i < n; i++) {
            float * D = distances + k * i;
            idx_t * I = labels + k * i;
            // initialize the heap
            if (metric_type == METRIC_L2) {
                maxheap_heapify (k, D, I);
            } else {
                minheap_heapify (k, D, I);
            }
            scanner->set_query (x + i * d);
            scanner->scan_codes (ntotal, codes.data(),
                                 nullptr, D, I, k);

            // re-order heap
            if (metric_type == METRIC_L2) {
                maxheap_reorder (k, D, I);
            } else {
                minheap_reorder (k, D, I);
            }
        }
    }

}
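
// Illustration (added for exposition; not part of the original file): typical
// use of the flat index. Quantized types such as QT_8bit need a train() call
// before add(); QT_fp16 and QT_8bit_direct do not (is_trained is already set
// in the constructor).
inline void demo_index_scalar_quantizer ()
{
    int d = 64;
    Index::idx_t nb = 1000, k = 5;
    IndexScalarQuantizer index (d, ScalarQuantizer::QT_8bit, METRIC_L2);
    std::vector<float> xb (nb * d, 0.5f);   // placeholder data
    index.train (nb, xb.data());
    index.add (nb, xb.data());
    std::vector<float> D (k);
    std::vector<Index::idx_t> I (k);
    index.search (1, xb.data(), k, D.data(), I.data());
}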


DistanceComputer *IndexScalarQuantizer::get_distance_computer () const
{
    SQDistanceComputer *dc = sq.get_distance_computer (metric_type);
    dc->code_size = sq.code_size;
    dc->codes = codes.data();
    return dc;
}


void IndexScalarQuantizer::reset()
{
    codes.clear();
    ntotal = 0;
}

void IndexScalarQuantizer::reconstruct_n(
        idx_t i0, idx_t ni, float* recons) const
{
    Quantizer *squant = select_quantizer (sq);
    ScopeDeleter1<Quantizer> del (squant);
    for (size_t i = 0; i < ni; i++) {
        squant->decode_vector(&codes[(i + i0) * code_size], recons + i * d);
    }
}

void IndexScalarQuantizer::reconstruct(idx_t key, float* recons) const
{
    reconstruct_n(key, 1, recons);
}


/*******************************************************************
 * IndexIVFScalarQuantizer implementation
 ********************************************************************/

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer
          (Index *quantizer, size_t d, size_t nlist,
           QuantizerType qtype, MetricType metric):
    IndexIVF (quantizer, d, nlist, 0, metric),
    sq (d, qtype)
{
    code_size = sq.code_size;
    // was not known at construction time
    invlists->code_size = code_size;
    is_trained = false;
    by_residual = true;
}

IndexIVFScalarQuantizer::IndexIVFScalarQuantizer ():
    IndexIVF ()
{
    by_residual = true;
}

void IndexIVFScalarQuantizer::train_residual (idx_t n, const float *x)
{
    const float * x_in = x;

    // 100k points more than enough
    x = fvecs_maybe_subsample (
        d, (size_t*)&n, 100000,
        x, verbose, 1234);

    ScopeDeleter<float> del_x (x_in == x ? nullptr : x);

    if (by_residual) {
        long * idx = new long [n];
        ScopeDeleter<long> del (idx);
        quantizer->assign (n, x, idx);
        float *residuals = new float [n * d];
        ScopeDeleter<float> del2 (residuals);

#pragma omp parallel for
        for (idx_t i = 0; i < n; i++) {
            quantizer->compute_residual (x + i * d, residuals + i * d, idx[i]);
        }
        sq.train (n, residuals);
    } else {
        sq.train (n, x);
    }

}

void IndexIVFScalarQuantizer::encode_vectors(idx_t n, const float* x,
                                             const idx_t *list_nos,
                                             uint8_t * codes) const
{
    Quantizer *squant = select_quantizer (sq);
    ScopeDeleter1<Quantizer> del (squant);
    memset(codes, 0, code_size * n);

#pragma omp parallel
    {
        std::vector<float> residual (d);

        // each thread takes care of a subset of vectors
#pragma omp for
        for (size_t i = 0; i < n; i++) {
            long list_no = list_nos [i];
            if (list_no >= 0) {
                const float *xi = x + i * d;
                if (by_residual) {
                    quantizer->compute_residual (
                          xi, residual.data(), list_no);
                    xi = residual.data ();
                }
                squant->encode_vector (xi, codes + i * code_size);
            }
        }
    }
}



void IndexIVFScalarQuantizer::add_with_ids
       (idx_t n, const float * x, const long *xids)
{
    FAISS_THROW_IF_NOT (is_trained);
    long * idx = new long [n];
    ScopeDeleter<long> del (idx);
    quantizer->assign (n, x, idx);
    size_t nadd = 0;
    Quantizer *squant = select_quantizer (sq);
    ScopeDeleter1<Quantizer> del2 (squant);

#pragma omp parallel reduction(+: nadd)
    {
        std::vector<float> residual (d);
        std::vector<uint8_t> one_code (code_size);
        int nt = omp_get_num_threads();
        int rank = omp_get_thread_num();

        // each thread takes care of a subset of lists
        for (size_t i = 0; i < n; i++) {
            long list_no = idx [i];
            if (list_no >= 0 && list_no % nt == rank) {
                long id = xids ? xids[i] : ntotal + i;

                const float * xi = x + i * d;
                if (by_residual) {
                    quantizer->compute_residual (xi, residual.data(), list_no);
                    xi = residual.data();
                }

                memset (one_code.data(), 0, code_size);
                squant->encode_vector (xi, one_code.data());

                invlists->add_entry (list_no, id, one_code.data());

                nadd++;

            }
        }
    }
    ntotal += n;
}




InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner
    (bool store_pairs) const
{
    return select_InvertedListScanner (metric_type, &sq, quantizer, store_pairs,
                                       by_residual);
}

void IndexIVFScalarQuantizer::reconstruct_from_offset (long list_no,
                                                       long offset,
                                                       float* recons) const
{
    std::vector<float> centroid(d);
    quantizer->reconstruct (list_no, centroid.data());

    const uint8_t* code = invlists->get_single_code (list_no, offset);
    sq.decode (code, recons, 1);
    for (int i = 0; i < d; ++i) {
        recons[i] += centroid[i];
    }
}
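
// Illustration (added for exposition; not part of the original file): the IVF
// variant encodes residuals relative to the coarse quantizer's centroids
// (by_residual is set in the constructor). An IndexFlatL2 is the usual coarse
// quantizer; it lives in IndexFlat.h, which this file does not include, so
// this stays a sketch:
//
//     IndexFlatL2 coarse (d);
//     IndexIVFScalarQuantizer index (&coarse, d, nlist,
//                                    ScalarQuantizer::QT_8bit, METRIC_L2);
//     index.train (n, xb);     // trains the coarse centroids and SQ ranges
//     index.add (n, xb);
//     index.nprobe = 8;
//     index.search (nq, xq, k, D, I);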

} // namespace faiss