Additional C++ templates for fast sa_decode: add overloads for identical coarse and fine tables (#2458)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2458

Add overloads of ::accum() for the case where all codes share a single coarse quantizer centroids table and a single fine quantizer centroids table

Reviewed By: mdouze

Differential Revision: D39314206

fbshipit-source-id: 170a0a1c434e00c95c98151e026d1e30ac017149
pull/2461/head
Alexandr Guzhva 2022-09-08 09:53:46 -07:00 committed by Facebook GitHub Bot
parent c740091662
commit 2cd84aa663
10 changed files with 2860 additions and 61 deletions

View File

@ -114,7 +114,8 @@
// superscalar architecture. Doing more vectors per call is less attractive
// because of the possible lack of available CPU registers, but it is still
// doable.
// The method signature is the following:
// If each code uses its own coarse quantizer centroids table and its own fine
// quantizer centroids table, then the following overload can be used:
// {
// static void accum(
// const float* const __restrict pqCoarseCentroids0,
@ -127,9 +128,23 @@
// const float weight1,
// float* const __restrict outputAccum);
// }
// If codes share the coarse quantizer centroids table and also share
// the fine quantizer centroids table, then the following overload can be
// used:
// {
// static void accum(
// const float* const __restrict pqCoarseCentroids,
// const float* const __restrict pqFineCentroids,
// const uint8_t* const __restrict code0,
// const float weight0,
// const uint8_t* const __restrict code1,
// const float weight1,
// float* const __restrict outputAccum);
// }
// * And one more overload for ::accum that decodes and accumulates
// three vectors per call. Sometimes, it makes sense, at least for AVX2.
// The method signature is the following:
// If each code uses its own coarse quantizer centroids table and its own fine
// quantizer centroids table, then the following overload can be used:
// {
// static void accum(
// const float* const __restrict pqCoarseCentroids0,
@ -146,6 +161,22 @@
// const float weight2,
// float* const __restrict outputAccum);
// }
// If codes share the coarse quantizer centroids table and also share
// the fine quantizer centroids table, then the following overload can be
// used:
// {
// static void accum(
// const float* const __restrict pqCoarseCentroids,
// const float* const __restrict pqFineCentroids,
// const uint8_t* const __restrict code0,
// const float weight0,
// const uint8_t* const __restrict code1,
// const float weight1,
// const uint8_t* const __restrict code2,
// const float weight2,
// float* const __restrict outputAccum);
// }
//
// The provided version is not multithreaded.
//
// Currently, an AVX2+FMA implementation is available. AVX512 version is also

View File

@ -277,7 +277,9 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -335,7 +337,68 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -409,6 +472,80 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
template <
@ -520,7 +657,9 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -573,7 +712,63 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 8 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -640,6 +835,73 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 8 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
template <
@ -751,7 +1013,9 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -804,7 +1068,63 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -871,6 +1191,73 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
pqCoarseCentroids, pqFineCentroids,
code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
// This partial specialization is expected to do nothing.
@ -911,7 +1298,8 @@ struct Index2LevelDecoderImpl<
const float weight0,
float* const __restrict outputAccum) {}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -923,7 +1311,19 @@ struct Index2LevelDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among codes.
//
// Intentionally a no-op: the body is empty, so outputAccum is left untouched.
// NOTE(review): presumably this is the terminating case of the CPOS
// recursion used by the other specializations; confirm against this
// specialization's template arguments.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -939,6 +1339,19 @@ struct Index2LevelDecoderImpl<
const float weight2,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among codes.
//
// Intentionally a no-op: the body is empty, so outputAccum is left untouched.
// NOTE(review): presumably this is the terminating case of the CPOS
// recursion used by the other specializations; confirm against this
// specialization's template arguments.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
// clang-format on
};
} // namespace
@ -1005,9 +1418,12 @@ struct Index2LevelDecoder {
outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
// decoded(code1).
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1036,7 +1452,40 @@ struct Index2LevelDecoder {
outputAccum);
}
// process 3 samples
// Two-sample entry point for codes that share one coarse pq centroids table
// and one fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1).
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // clang-format off

    // All arguments are forwarded unchanged to the implementation
    // instantiated with 0 as its last template argument (the output
    // position at which processing begins).
    Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, 0>::accum(
            pqCoarseCentroids, pqFineCentroids,
            code0, weight0,
            code1, weight1,
            outputAccum);

    // clang-format on
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
@ -1074,6 +1523,40 @@ struct Index2LevelDecoder {
weight2,
outputAccum);
}
// Three-sample entry point for codes that share one coarse pq centroids
// table and one fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2).
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // clang-format off

    // All arguments are forwarded unchanged to the implementation
    // instantiated with 0 as its last template argument (the output
    // position at which processing begins).
    Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, 0>::accum(
            pqCoarseCentroids, pqFineCentroids,
            code0, weight0,
            code1, weight1,
            code2, weight2,
            outputAccum);

    // clang-format on
}
};
} // namespace cppcontrib

View File

@ -122,9 +122,12 @@ struct Index2LevelDecoder {
}
}
// process 2 samples
// Performs
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1).
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -181,10 +184,72 @@ struct Index2LevelDecoder {
}
}
// process 3 samples
// Performs
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
// + weight2 * decoded(code2)
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// coarse quantizer
const coarse_storage_type* const __restrict coarse0 =
reinterpret_cast<const coarse_storage_type*>(code0);
const coarse_storage_type* const __restrict coarse1 =
reinterpret_cast<const coarse_storage_type*>(code1);
// fine quantizer
const uint8_t* const __restrict fine0 =
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
const uint8_t* const __restrict fine1 =
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
#pragma unroll
for (intptr_t i = 0; i < DIM; i++) {
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
const intptr_t fineCentroidIdx = i / FINE_SIZE;
const intptr_t fineCentroidOffset = i % FINE_SIZE;
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
const intptr_t fineCode0 = fine0[fineCentroidIdx];
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
const intptr_t fineCode1 = fine1[fineCentroidIdx];
const float* const __restrict coarsePtr0 = pqCoarseCentroids +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr0 = pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset;
const float* const __restrict coarsePtr1 = pqCoarseCentroids +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr1 = pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset;
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
weight1 * (*coarsePtr1 + *finePtr1);
}
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -259,6 +324,83 @@ struct Index2LevelDecoder {
weight2 * (*coarsePtr2 + *finePtr2);
}
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const coarse_storage_type* const __restrict coarse0 =
reinterpret_cast<const coarse_storage_type*>(code0);
const coarse_storage_type* const __restrict coarse1 =
reinterpret_cast<const coarse_storage_type*>(code1);
const coarse_storage_type* const __restrict coarse2 =
reinterpret_cast<const coarse_storage_type*>(code2);
// fine quantizer
const uint8_t* const __restrict fine0 =
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
const uint8_t* const __restrict fine1 =
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
const uint8_t* const __restrict fine2 =
code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
#pragma unroll
for (intptr_t i = 0; i < DIM; i++) {
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
const intptr_t fineCentroidIdx = i / FINE_SIZE;
const intptr_t fineCentroidOffset = i % FINE_SIZE;
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
const intptr_t fineCode0 = fine0[fineCentroidIdx];
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
const intptr_t fineCode1 = fine1[fineCentroidIdx];
const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
const intptr_t fineCode2 = fine2[fineCentroidIdx];
const float* const __restrict coarsePtr0 = pqCoarseCentroids +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr0 = pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset;
const float* const __restrict coarsePtr1 = pqCoarseCentroids +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr1 = pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset;
const float* const __restrict coarsePtr2 = pqCoarseCentroids +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr2 = pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset;
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
weight1 * (*coarsePtr1 + *finePtr1) +
weight2 * (*coarsePtr2 + *finePtr2);
}
}
};
} // namespace cppcontrib

View File

@ -329,7 +329,9 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -431,7 +433,109 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 3 samples
// Two-sample accumulation with shared tables.
// Both codes are resolved against the same coarse pq centroids table and the
// same fine pq centroids table. This step updates the 8 floats of
// outputAccum that start at CPOS (as two 4-float vectors) and then forwards
// to the CPOS + 8 step.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // Coarse indices are read from the beginning of each code.
    const uint8_t* const __restrict coarseBytes0 = code0;
    const uint8_t* const __restrict coarseBytes1 = code1;

    // Fine indices are read starting N_COARSE_ELEMENTS_BYTES bytes into
    // each code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;

    // Fine chunks hold 4 floats, so one 8-float step needs one coarse
    // index and two consecutive fine indices (a, b) per code.
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    coarseBytes0);
    const intptr_t fineIdx0a = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    fineBytes0);
    const intptr_t fineIdx0b = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    fineBytes0);

    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    coarseBytes1);
    const intptr_t fineIdx1a = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    fineBytes1);
    const intptr_t fineIdx1b = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    fineBytes1);

    // Centroid data selected by code0 (shared tables).
    const float* const coarsePtr0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const finePtr0a = pqFineCentroids +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineIdx0a) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const finePtr0b = pqFineCentroids +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineIdx0b) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Centroid data selected by code1 (same tables).
    const float* const coarsePtr1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const finePtr1a = pqFineCentroids +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineIdx1a) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const finePtr1b = pqFineCentroids +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineIdx1b) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Load the running sums as a low and a high 4-float half.
    auto accLow = vld1q_f32(outputAccum + CPOS);
    auto accHigh = vld1q_f32(outputAccum + CPOS + 4);

    // Pass the pair through the accumulation helper once per code
    // (code0 first).
    auto acc = elementaryBlock4x2bAccum(
            coarsePtr0, finePtr0a, finePtr0b, weight0, {accLow, accHigh});
    acc = elementaryBlock4x2bAccum(
            coarsePtr1, finePtr1a, finePtr1b, weight1, acc);

    // Write both halves back.
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Hand over to the step that handles the next 8 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  outputAccum);
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -569,6 +673,139 @@ struct Index2LevelDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Handles the 8 output floats starting at outputAccum + CPOS by feeding the
// three codes through elementaryBlock4x2bAccum(), then forwards the very
// same arguments to the CPOS + 8 instantiation.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // The fine quantizer bytes follow the coarse quantizer bytes of a code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes2 =
            code2 + N_COARSE_ELEMENTS_BYTES;

    // Per code: one coarse index plus two adjacent fine indices
    // (chunks of 4 floats, 8 floats per step).
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code0);
    const intptr_t fineIdx0Lo = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    fineBytes0);
    const intptr_t fineIdx0Hi = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    fineBytes0);
    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code1);
    const intptr_t fineIdx1Lo = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    fineBytes1);
    const intptr_t fineIdx1Hi = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    fineBytes1);
    const intptr_t coarseIdx2 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code2);
    const intptr_t fineIdx2Lo = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    fineBytes2);
    const intptr_t fineIdx2Hi = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    fineBytes2);

    // Rows of the shared tables selected by code0.
    const float* const coarseRow0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow0Lo = pqFineCentroids +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineIdx0Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const fineRow0Hi = pqFineCentroids +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineIdx0Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code1.
    const float* const coarseRow1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow1Lo = pqFineCentroids +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineIdx1Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const fineRow1Hi = pqFineCentroids +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineIdx1Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code2.
    const float* const coarseRow2 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow2Lo = pqFineCentroids +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineIdx2Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const fineRow2Hi = pqFineCentroids +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineIdx2Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Load the current accumulator, fold in the three samples, store it back.
    const auto accLo = vld1q_f32(outputAccum + CPOS);
    const auto accHi = vld1q_f32(outputAccum + CPOS + 4);
    auto acc = elementaryBlock4x2bAccum(
            coarseRow0, fineRow0Lo, fineRow0Hi, weight0, {accLo, accHi});
    acc = elementaryBlock4x2bAccum(
            coarseRow1, fineRow1Lo, fineRow1Hi, weight1, acc);
    acc = elementaryBlock4x2bAccum(
            coarseRow2, fineRow2Lo, fineRow2Hi, weight2, acc);
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Continue with the next 8 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  code2,
                  weight2,
                  outputAccum);
}
};
template <
@ -709,7 +946,9 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -792,7 +1031,90 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Handles the 8 output floats starting at outputAccum + CPOS by feeding both
// codes through elementaryBlock8x1bAccum(), then forwards the very same
// arguments to the CPOS + 8 instantiation.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // The fine quantizer bytes follow the coarse quantizer bytes of a code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;

    // Per code: one coarse index and one fine index (chunks of 8 floats).
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code0);
    const intptr_t fineIdx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes0);
    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code1);
    const intptr_t fineIdx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes1);

    // Rows of the shared tables selected by code0.
    const float* const coarseRow0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow0 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx0) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code1.
    const float* const coarseRow1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow1 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx1) * FINE_SIZE +
            fineCentroidOffset;

    // Load the current accumulator, fold in both samples, store it back.
    const auto accLo = vld1q_f32(outputAccum + CPOS);
    const auto accHi = vld1q_f32(outputAccum + CPOS + 4);
    auto acc = elementaryBlock8x1bAccum(
            coarseRow0, fineRow0, weight0, {accLo, accHi});
    acc = elementaryBlock8x1bAccum(coarseRow1, fineRow1, weight1, acc);
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Continue with the next 8 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  outputAccum);
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -902,6 +1224,111 @@ struct Index2LevelDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Handles the 8 output floats starting at outputAccum + CPOS by feeding the
// three codes through elementaryBlock8x1bAccum(), then forwards the very
// same arguments to the CPOS + 8 instantiation.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // The fine quantizer bytes follow the coarse quantizer bytes of a code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes2 =
            code2 + N_COARSE_ELEMENTS_BYTES;

    // Per code: one coarse index and one fine index (chunks of 8 floats).
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code0);
    const intptr_t fineIdx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes0);
    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code1);
    const intptr_t fineIdx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes1);
    const intptr_t coarseIdx2 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code2);
    const intptr_t fineIdx2 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes2);

    // Rows of the shared tables selected by code0.
    const float* const coarseRow0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow0 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx0) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code1.
    const float* const coarseRow1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow1 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx1) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code2.
    const float* const coarseRow2 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow2 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx2) * FINE_SIZE +
            fineCentroidOffset;

    // Load the current accumulator, fold in the three samples, store it back.
    const auto accLo = vld1q_f32(outputAccum + CPOS);
    const auto accHi = vld1q_f32(outputAccum + CPOS + 4);
    auto acc = elementaryBlock8x1bAccum(
            coarseRow0, fineRow0, weight0, {accLo, accHi});
    acc = elementaryBlock8x1bAccum(coarseRow1, fineRow1, weight1, acc);
    acc = elementaryBlock8x1bAccum(coarseRow2, fineRow2, weight2, acc);
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Continue with the next 8 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  code2,
                  weight2,
                  outputAccum);
}
};
template <
@ -1039,7 +1466,9 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1120,7 +1549,88 @@ struct Index2LevelDecoderImpl<
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Handles the 4 output floats starting at outputAccum + CPOS by feeding both
// codes through elementaryBlock4x1bAccum(), then forwards the very same
// arguments to the CPOS + 4 instantiation.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // The fine quantizer bytes follow the coarse quantizer bytes of a code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;

    // Per code: one coarse index and one fine index (chunks of 4 floats).
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code0);
    const intptr_t fineIdx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes0);
    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code1);
    const intptr_t fineIdx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes1);

    // Rows of the shared tables selected by code0.
    const float* const coarseRow0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow0 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx0) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code1.
    const float* const coarseRow1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow1 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx1) * FINE_SIZE +
            fineCentroidOffset;

    // Load the current accumulator, fold in both samples, store it back.
    auto acc = vld1q_f32(outputAccum + CPOS);
    acc = elementaryBlock4x1bAccum(coarseRow0, fineRow0, weight0, acc);
    acc = elementaryBlock4x1bAccum(coarseRow1, fineRow1, weight1, acc);
    vst1q_f32(outputAccum + CPOS, acc);

    // Continue with the next 4 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 4>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  outputAccum);
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1228,6 +1738,109 @@ struct Index2LevelDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Handles the 4 output floats starting at outputAccum + CPOS by feeding the
// three codes through elementaryBlock4x1bAccum(), then forwards the very
// same arguments to the CPOS + 4 instantiation.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // The fine quantizer bytes follow the coarse quantizer bytes of a code.
    const uint8_t* const __restrict fineBytes0 =
            code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes1 =
            code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const __restrict fineBytes2 =
            code2 + N_COARSE_ELEMENTS_BYTES;

    // Per code: one coarse index and one fine index (chunks of 4 floats).
    const intptr_t coarseIdx0 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code0);
    const intptr_t fineIdx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes0);
    const intptr_t coarseIdx1 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code1);
    const intptr_t fineIdx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes1);
    const intptr_t coarseIdx2 = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(
                    code2);
    const intptr_t fineIdx2 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(fineBytes2);

    // Rows of the shared tables selected by code0.
    const float* const coarseRow0 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow0 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx0) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code1.
    const float* const coarseRow1 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow1 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx1) * FINE_SIZE +
            fineCentroidOffset;

    // Rows of the shared tables selected by code2.
    const float* const coarseRow2 = pqCoarseCentroids +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseIdx2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const fineRow2 = pqFineCentroids +
            (fineCentroidIdx * FINE_TABLE_BYTES + fineIdx2) * FINE_SIZE +
            fineCentroidOffset;

    // Load the current accumulator, fold in the three samples, store it back.
    auto acc = vld1q_f32(outputAccum + CPOS);
    acc = elementaryBlock4x1bAccum(coarseRow0, fineRow0, weight0, acc);
    acc = elementaryBlock4x1bAccum(coarseRow1, fineRow1, weight1, acc);
    acc = elementaryBlock4x1bAccum(coarseRow2, fineRow2, weight2, acc);
    vst1q_f32(outputAccum + CPOS, acc);

    // Continue with the next 4 floats.
    Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 4>::
            accum(pqCoarseCentroids,
                  pqFineCentroids,
                  code0,
                  weight0,
                  code1,
                  weight1,
                  code2,
                  weight2,
                  outputAccum);
}
};
// This partial specialization is expected to do nothing.
@ -1266,7 +1879,9 @@ struct Index2LevelDecoderImpl<
const float weight0,
float* const __restrict outputAccum) {}
// process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1278,7 +1893,21 @@ struct Index2LevelDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Intentionally empty: this overload belongs to the partial specialization
// that is expected to do nothing, so every argument is ignored and
// outputAccum is left untouched.
// NOTE(review): presumably this is where the CPOS recursion of the other
// specializations ends -- confirm against the specialization header.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1293,6 +1922,20 @@ struct Index2LevelDecoderImpl<
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Intentionally empty: this overload belongs to the partial specialization
// that is expected to do nothing, so every argument is ignored and
// outputAccum is left untouched.
// NOTE(review): presumably this is where the CPOS recursion of the other
// specializations ends -- confirm against the specialization header.
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
};
} // namespace
@ -1358,9 +2001,12 @@ struct Index2LevelDecoder {
outputAccum);
}
// process 2 samples
// Process 2 samples.
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
// decoded(code1).
//
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1389,9 +2035,42 @@ struct Index2LevelDecoder {
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1).
//
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Parameters:
//  * pqCoarseCentroids - coarse quantizer centroids table used for both codes
//  * pqFineCentroids   - fine quantizer centroids table used for both codes
//  * code0, code1      - the two encoded samples
//  * weight0, weight1  - multipliers applied to the respective decoded sample
//  * outputAccum       - accumulator that is updated in place
//
// The whole job is delegated to Index2LevelDecoderImpl, instantiated with 0
// as its last template argument (the starting position; the Impl
// specializations visible in this file advance it by 4 or 8 per step).
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
0>::
accum(pqCoarseCentroids,
pqFineCentroids,
code0,
weight0,
code1,
weight1,
outputAccum);
}
// Process 3 samples.
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
//
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
@ -1427,6 +2106,40 @@ struct Index2LevelDecoder {
weight2,
outputAccum);
}
// Process 3 samples.
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2).
//
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Parameters:
//  * pqCoarseCentroids - coarse quantizer centroids table used for all codes
//  * pqFineCentroids   - fine quantizer centroids table used for all codes
//  * code0 .. code2    - the three encoded samples
//  * weight0 .. weight2 - multipliers applied to the respective decoded sample
//  * outputAccum       - accumulator that is updated in place
//
// The whole job is delegated to Index2LevelDecoderImpl, instantiated with 0
// as its last template argument (the starting position; the Impl
// specializations visible in this file advance it by 4 or 8 per step).
static void accum(
const float* const __restrict pqCoarseCentroids,
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
0>::
accum(pqCoarseCentroids,
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
} // namespace cppcontrib

View File

@ -97,7 +97,10 @@ struct IndexMinMaxDecoder {
minvAccum += minv;
}
// Process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -137,7 +140,48 @@ struct IndexMinMaxDecoder {
minvAccum += minv0 + minv1;
}
// Process 2 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//  * minvAccum += weight0 * minv0 + weight1 * minv1
// where (scalerN, minvN) are the two floats read from the start of codeN.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(float);

    const float* const __restrict prefix0 =
            reinterpret_cast<const float*>(code0);
    const float* const __restrict prefix1 =
            reinterpret_cast<const float*>(code1);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = prefix0[0] * weight0;
    const float weightedMinv0 = prefix0[1] * weight0;
    const float weightedScaler1 = prefix1[0] * weight1;
    const float weightedMinv1 = prefix1[1] * weight1;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqCoarseCentroids,
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1;
}
// Process 2 samples.
// Each code uses its own fine pq centroids table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -173,7 +217,46 @@ struct IndexMinMaxDecoder {
minvAccum += minv0 + minv1;
}
// Process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//  * minvAccum += weight0 * minv0 + weight1 * minv1
// where (scalerN, minvN) are the two floats read from the start of codeN.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(float);

    const float* const __restrict prefix0 =
            reinterpret_cast<const float*>(code0);
    const float* const __restrict prefix1 =
            reinterpret_cast<const float*>(code1);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = prefix0[0] * weight0;
    const float weightedMinv0 = prefix0[1] * weight0;
    const float weightedScaler1 = prefix1[0] * weight1;
    const float weightedMinv1 = prefix1[1] * weight1;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1;
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -227,7 +310,58 @@ struct IndexMinMaxDecoder {
minvAccum += minv0 + minv1 + minv2;
}
// Process 3 samples
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//                 + weight2 * scaler2 * decoded(code2)
//  * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
// where (scalerN, minvN) are the two floats read from the start of codeN.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(float);

    const float* const __restrict prefix0 =
            reinterpret_cast<const float*>(code0);
    const float* const __restrict prefix1 =
            reinterpret_cast<const float*>(code1);
    const float* const __restrict prefix2 =
            reinterpret_cast<const float*>(code2);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = prefix0[0] * weight0;
    const float weightedMinv0 = prefix0[1] * weight0;
    const float weightedScaler1 = prefix1[0] * weight1;
    const float weightedMinv1 = prefix1[1] * weight1;
    const float weightedScaler2 = prefix2[0] * weight2;
    const float weightedMinv2 = prefix2[1] * weight2;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqCoarseCentroids,
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            code2 + kPrefixBytes,
            weightedScaler2,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1 + weightedMinv2;
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -274,6 +408,52 @@ struct IndexMinMaxDecoder {
minvAccum += minv0 + minv1 + minv2;
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//                 + weight2 * scaler2 * decoded(code2)
//  * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
// where (scalerN, minvN) are the two floats read from the start of codeN.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(float);

    const float* const __restrict prefix0 =
            reinterpret_cast<const float*>(code0);
    const float* const __restrict prefix1 =
            reinterpret_cast<const float*>(code1);
    const float* const __restrict prefix2 =
            reinterpret_cast<const float*>(code2);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = prefix0[0] * weight0;
    const float weightedMinv0 = prefix0[1] * weight0;
    const float weightedScaler1 = prefix1[0] * weight1;
    const float weightedMinv1 = prefix1[1] * weight1;
    const float weightedScaler2 = prefix2[0] * weight2;
    const float weightedMinv2 = prefix2[1] * weight2;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            code2 + kPrefixBytes,
            weightedScaler2,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1 + weightedMinv2;
}
};
} // namespace cppcontrib

View File

@ -102,7 +102,10 @@ struct IndexMinMaxFP16Decoder {
minvAccum += minv;
}
// Process 2 samples
// Process 2 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -142,7 +145,48 @@ struct IndexMinMaxFP16Decoder {
minvAccum += minv0 + minv1;
}
// Process 2 samples
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//  * minvAccum += weight0 * minv0 + weight1 * minv1
// where (scalerN, minvN) are obtained by passing the two uint16_t values at
// the start of codeN through faiss::decode_fp16().
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(uint16_t);

    const uint16_t* const __restrict prefix0 =
            reinterpret_cast<const uint16_t*>(code0);
    const uint16_t* const __restrict prefix1 =
            reinterpret_cast<const uint16_t*>(code1);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = faiss::decode_fp16(prefix0[0]) * weight0;
    const float weightedMinv0 = faiss::decode_fp16(prefix0[1]) * weight0;
    const float weightedScaler1 = faiss::decode_fp16(prefix1[0]) * weight1;
    const float weightedMinv1 = faiss::decode_fp16(prefix1[1]) * weight1;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqCoarseCentroids,
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1;
}
// Process 2 samples.
// Each code uses its own fine pq centroids table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -178,7 +222,46 @@ struct IndexMinMaxFP16Decoder {
minvAccum += minv0 + minv1;
}
// Process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//  * minvAccum += weight0 * minv0 + weight1 * minv1
// where (scalerN, minvN) are obtained by passing the two uint16_t values at
// the start of codeN through faiss::decode_fp16().
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(uint16_t);

    const uint16_t* const __restrict prefix0 =
            reinterpret_cast<const uint16_t*>(code0);
    const uint16_t* const __restrict prefix1 =
            reinterpret_cast<const uint16_t*>(code1);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = faiss::decode_fp16(prefix0[0]) * weight0;
    const float weightedMinv0 = faiss::decode_fp16(prefix0[1]) * weight0;
    const float weightedScaler1 = faiss::decode_fp16(prefix1[0]) * weight1;
    const float weightedMinv1 = faiss::decode_fp16(prefix1[1]) * weight1;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1;
}
// Process 3 samples.
// Each code uses its own coarse pq centroids table and fine pq centroids
// table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -232,7 +315,58 @@ struct IndexMinMaxFP16Decoder {
minvAccum += minv0 + minv1 + minv2;
}
// Process 3 samples
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//                 + weight2 * scaler2 * decoded(code2)
//  * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
// where (scalerN, minvN) are obtained by passing the two uint16_t values at
// the start of codeN through faiss::decode_fp16().
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(uint16_t);

    const uint16_t* const __restrict prefix0 =
            reinterpret_cast<const uint16_t*>(code0);
    const uint16_t* const __restrict prefix1 =
            reinterpret_cast<const uint16_t*>(code1);
    const uint16_t* const __restrict prefix2 =
            reinterpret_cast<const uint16_t*>(code2);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = faiss::decode_fp16(prefix0[0]) * weight0;
    const float weightedMinv0 = faiss::decode_fp16(prefix0[1]) * weight0;
    const float weightedScaler1 = faiss::decode_fp16(prefix1[0]) * weight1;
    const float weightedMinv1 = faiss::decode_fp16(prefix1[1]) * weight1;
    const float weightedScaler2 = faiss::decode_fp16(prefix2[0]) * weight2;
    const float weightedMinv2 = faiss::decode_fp16(prefix2[1]) * weight2;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqCoarseCentroids,
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            code2 + kPrefixBytes,
            weightedScaler2,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1 + weightedMinv2;
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
//
// Performs
// * outputAccum += weight0 * scaler0 * decoded(code0)
// + weight1 * scaler1 * decoded(code1)
@ -279,6 +413,52 @@ struct IndexMinMaxFP16Decoder {
minvAccum += minv0 + minv1 + minv2;
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Performs
//  * outputAccum += weight0 * scaler0 * decoded(code0)
//                 + weight1 * scaler1 * decoded(code1)
//                 + weight2 * scaler2 * decoded(code2)
//  * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
// where (scalerN, minvN) are obtained by passing the two uint16_t values at
// the start of codeN through faiss::decode_fp16().
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum,
        float& minvAccum) {
    // Size of the {scaler, minv} prefix in front of the sub-index payload.
    constexpr auto kPrefixBytes = 2 * sizeof(uint16_t);

    const uint16_t* const __restrict prefix0 =
            reinterpret_cast<const uint16_t*>(code0);
    const uint16_t* const __restrict prefix1 =
            reinterpret_cast<const uint16_t*>(code1);
    const uint16_t* const __restrict prefix2 =
            reinterpret_cast<const uint16_t*>(code2);

    // Fold the sample weights into the per-code scaler and minimum.
    const float weightedScaler0 = faiss::decode_fp16(prefix0[0]) * weight0;
    const float weightedMinv0 = faiss::decode_fp16(prefix0[1]) * weight0;
    const float weightedScaler1 = faiss::decode_fp16(prefix1[0]) * weight1;
    const float weightedMinv1 = faiss::decode_fp16(prefix1[1]) * weight1;
    const float weightedScaler2 = faiss::decode_fp16(prefix2[0]) * weight2;
    const float weightedMinv2 = faiss::decode_fp16(prefix2[1]) * weight2;

    // The sub-index decodes the payloads, using the scalers as weights.
    SubIndexT::accum(
            pqFineCentroids,
            code0 + kPrefixBytes,
            weightedScaler0,
            code1 + kPrefixBytes,
            weightedScaler1,
            code2 + kPrefixBytes,
            weightedScaler2,
            outputAccum);

    minvAccum += weightedMinv0 + weightedMinv1 + weightedMinv2;
}
};
} // namespace cppcontrib

View File

@ -226,7 +226,8 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -274,7 +275,57 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -334,6 +385,67 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -410,7 +522,8 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -453,7 +566,52 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// clang-format off
// process chunks, 8 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -506,6 +664,60 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 8 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -582,7 +794,8 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -625,7 +838,52 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// clang-format off
// process chunks, 4 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
pqFineCentroids,
code0, weight0,
code1, weight1,
outputAccum);
// clang-format on
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -678,6 +936,59 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 4 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
pqFineCentroids, code0, weight0,
code1, weight1,
code2, weight2,
outputAccum);
// clang-format on
}
};
// This partial specialization is expected to do nothing.
@ -712,7 +1023,8 @@ struct IndexPQDecoderImpl<
const float weight0,
float* const __restrict outputAccum) {}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -722,7 +1034,18 @@ struct IndexPQDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Deliberately empty: this overload belongs to the partial specialization
// that is expected to do nothing, so outputAccum is left untouched.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // Nothing to decode here.
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -735,6 +1058,18 @@ struct IndexPQDecoderImpl<
const float weight2,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Deliberately empty: this overload belongs to the partial specialization
// that is expected to do nothing, so outputAccum is left untouched.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Nothing to decode here.
}
// clang-format on
};
@ -773,7 +1108,9 @@ struct IndexPQDecoder {
pqFineCentroids, code, weight, outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
@ -794,7 +1131,25 @@ struct IndexPQDecoder {
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
@ -820,6 +1175,31 @@ struct IndexPQDecoder {
weight2,
outputAccum);
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
} // namespace cppcontrib

View File

@ -76,7 +76,9 @@ struct IndexPQDecoder {
}
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
//
// Performs
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
static void accum(
@ -112,7 +114,46 @@ struct IndexPQDecoder {
}
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Performs
//  outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
//
// Scalar implementation: for every output dimension, one float is looked up
// per code in the shared table and the weighted values are added to
// outputAccum. Each code byte is used directly as the centroid index of its
// sub-quantizer.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
#pragma unroll
    for (intptr_t dim = 0; dim < DIM; dim++) {
        // Which sub-quantizer covers this dimension, and where inside its
        // centroid the dimension sits.
        const intptr_t subq = dim / FINE_SIZE;
        const intptr_t within = dim % FINE_SIZE;

        // Centroid selected by each code for this sub-quantizer.
        const intptr_t centroid0 = code0[subq];
        const intptr_t centroid1 = code1[subq];

        const float* const __restrict value0 = pqFineCentroids +
                (subq * FINE_TABLE_BYTES + centroid0) * FINE_SIZE + within;
        const float* const __restrict value1 = pqFineCentroids +
                (subq * FINE_TABLE_BYTES + centroid1) * FINE_SIZE + within;

        outputAccum[dim] += weight0 * (*value0) + weight1 * (*value1);
    }
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
@ -157,6 +198,52 @@ struct IndexPQDecoder {
weight2 * (*finePtr2);
}
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
//
// Scalar implementation: for every output dimension, one float is looked up
// per code in the shared table and the weighted values are added to
// outputAccum. Each code byte is used directly as the centroid index of its
// sub-quantizer.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
#pragma unroll
    for (intptr_t dim = 0; dim < DIM; dim++) {
        // Which sub-quantizer covers this dimension, and where inside its
        // centroid the dimension sits.
        const intptr_t subq = dim / FINE_SIZE;
        const intptr_t within = dim % FINE_SIZE;

        // Centroid selected by each code for this sub-quantizer.
        const intptr_t centroid0 = code0[subq];
        const intptr_t centroid1 = code1[subq];
        const intptr_t centroid2 = code2[subq];

        const float* const __restrict value0 = pqFineCentroids +
                (subq * FINE_TABLE_BYTES + centroid0) * FINE_SIZE + within;
        const float* const __restrict value1 = pqFineCentroids +
                (subq * FINE_TABLE_BYTES + centroid1) * FINE_SIZE + within;
        const float* const __restrict value2 = pqFineCentroids +
                (subq * FINE_TABLE_BYTES + centroid2) * FINE_SIZE + within;

        outputAccum[dim] += weight0 * (*value0) + weight1 * (*value1) +
                weight2 * (*value2);
    }
}
};
} // namespace cppcontrib

View File

@ -252,7 +252,8 @@ struct IndexPQDecoderImpl<
pqFineCentroids0, code0, weight0, outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -326,7 +327,76 @@ struct IndexPQDecoderImpl<
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// process chunks, 4 float
// but 8 floats per loop
const intptr_t fineCode0a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine0);
const intptr_t fineCode0b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine0);
const intptr_t fineCode1a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine1);
const intptr_t fineCode1b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine1);
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock4x2bAccum(
pqFineCentroids +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode0a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode0b) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode1a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode1b) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -426,6 +496,104 @@ struct IndexPQDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// process chunks, 4 float
// but 8 floats per loop
const intptr_t fineCode0a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine0);
const intptr_t fineCode0b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine0);
const intptr_t fineCode1a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine1);
const intptr_t fineCode1b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine1);
const intptr_t fineCode2a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine2);
const intptr_t fineCode2b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine2);
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock4x2bAccum(
pqFineCentroids +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode0a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode0b) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode1a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode1b) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode2a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode2b) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -506,7 +674,8 @@ struct IndexPQDecoderImpl<
pqFineCentroids0, code0, weight0, outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -561,7 +730,57 @@ struct IndexPQDecoderImpl<
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// process chunks, 8 float
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock8x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -633,6 +852,76 @@ struct IndexPQDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// process chunks, 8 float
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
const intptr_t fineCode2 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine2);
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock8x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -710,7 +999,8 @@ struct IndexPQDecoderImpl<
pqFineCentroids0, code0, weight0, outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -763,7 +1053,55 @@ struct IndexPQDecoderImpl<
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
// process chunks, 4 float
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
auto existingValue = vld1q_f32(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -833,6 +1171,74 @@ struct IndexPQDecoderImpl<
weight2,
outputAccum);
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// process chunks, 4 float
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
const intptr_t fineCode2 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine2);
auto existingValue = vld1q_f32(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
// This partial specialization is expected to do nothing.
@ -865,7 +1271,8 @@ struct IndexPQDecoderImpl<
const float weight0,
float* const __restrict outputAccum) {}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -875,7 +1282,18 @@ struct IndexPQDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Intentionally empty: this overload belongs to the partial specialization
// that is expected to do nothing, so no table is read and outputAccum is
// left untouched.
// NOTE(review): presumably this is where the CPOS + 4 chain of the working
// specialization ends -- confirm against the specialization's template
// arguments.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -887,6 +1305,18 @@ struct IndexPQDecoderImpl<
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Intentionally empty: this overload belongs to the partial specialization
// that is expected to do nothing, so no table is read and outputAccum is
// left untouched.
// NOTE(review): presumably this is where the CPOS + 4 chain of the working
// specialization ends -- confirm against the specialization's template
// arguments.
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
};
} // namespace
@ -923,7 +1353,9 @@ struct IndexPQDecoder {
pqFineCentroids, code, weight, outputAccum);
}
// process 2 samples
// Process 2 samples.
// Each code uses its own fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
@ -944,7 +1376,25 @@ struct IndexPQDecoder {
outputAccum);
}
// process 3 samples
// Process 2 samples.
// Fine pq centroids table is shared among codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
}
// Process 3 samples.
// Each code uses its own fine pq centroids table.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
@ -970,6 +1420,31 @@ struct IndexPQDecoder {
weight2,
outputAccum);
}
// Process 3 samples.
// Fine pq centroids table is shared among codes.
//
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqFineCentroids,
const uint8_t* const __restrict code0,
const float weight0,
const uint8_t* const __restrict code1,
const float weight1,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
pqFineCentroids,
code0,
weight0,
code1,
weight1,
code2,
weight2,
outputAccum);
}
};
} // namespace cppcontrib

View File

@ -162,11 +162,13 @@ void verifyIndex2LevelDecoder(
rng.seed(123);
std::vector<float> outputContrib2s(d, 0);
std::vector<float> outputContrib2sSame(d, 0);
for (size_t i = 0; i < n; i += 2) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib2s[j] = (j + 1) * (j + 1);
outputContrib2sSame[j] = (j + 1) * (j + 1);
}
// do a single step, 2 samples per step
@ -184,6 +186,16 @@ void verifyIndex2LevelDecoder(
weight1,
outputContrib2s.data());
// do a single step, 2 samples per step
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib2sSame.data());
// do two steps, 1 sample per step
T::accum(
pqCoarseCentroidsQ,
@ -201,6 +213,7 @@ void verifyIndex2LevelDecoder(
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]);
}
}
@ -208,12 +221,14 @@ void verifyIndex2LevelDecoder(
rng.seed(123);
std::vector<float> outputContrib3s(d, 0);
std::vector<float> outputContrib3sSame(d, 0);
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
outputContrib3sSame[j] = (j + 1) * (j + 1);
}
// do a single step, 3 samples per step
@ -236,6 +251,18 @@ void verifyIndex2LevelDecoder(
weight2,
outputContrib3s.data());
// do a single step, 3 samples per step
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3sSame.data());
// do three steps, 1 sample per step
T::accum(
pqCoarseCentroidsQ,
@ -259,6 +286,7 @@ void verifyIndex2LevelDecoder(
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]);
}
}
}
@ -358,15 +386,19 @@ void verifyMinMaxIndex2LevelDecoder(
rng.seed(123);
std::vector<float> outputContrib2s(d, 0);
std::vector<float> outputContrib2sSame(d, 0);
float outputMinv2s = 0;
float outputMinv2sSame = 0;
for (size_t i = 0; i < n; i += 2) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib2s[j] = (j + 1) * (j + 1);
outputContrib2sSame[j] = (j + 1) * (j + 1);
}
outputMinv1s = 0;
outputMinv2s = 0;
outputMinv2sSame = 0;
// do a single step, 2 samples per step
const float weight0 = u(rng);
@ -384,6 +416,17 @@ void verifyMinMaxIndex2LevelDecoder(
outputContrib2s.data(),
outputMinv2s);
// do a single step, 2 samples per step
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib2sSame.data(),
outputMinv2sSame);
// do two steps, 1 sample per step
T::accum(
pqCoarseCentroidsQ,
@ -405,6 +448,9 @@ void verifyMinMaxIndex2LevelDecoder(
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib2s[j] + outputMinv2s);
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib2sSame[j] + outputMinv2sSame);
}
}
@ -413,15 +459,19 @@ void verifyMinMaxIndex2LevelDecoder(
std::vector<float> outputContrib3s(d, 0);
float outputMinv3s = 0;
std::vector<float> outputContrib3sSame(d, 0);
float outputMinv3sSame = 0;
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
outputContrib3sSame[j] = (j + 1) * (j + 1);
}
outputMinv1s = 0;
outputMinv3s = 0;
outputMinv3sSame = 0;
// do a single step, 3 samples per step
const float weight0 = u(rng);
@ -444,6 +494,19 @@ void verifyMinMaxIndex2LevelDecoder(
outputContrib3s.data(),
outputMinv3s);
// do a single step, 3 samples per step
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3sSame.data(),
outputMinv3sSame);
// do three steps, 1 sample per step
T::accum(
pqCoarseCentroidsQ,
@ -472,6 +535,9 @@ void verifyMinMaxIndex2LevelDecoder(
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib3s[j] + outputMinv3s);
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib3sSame[j] + outputMinv3sSame);
}
}
}
@ -541,11 +607,13 @@ void verifyIndexPQDecoder(
rng.seed(123);
std::vector<float> outputContrib2s(d, 0);
std::vector<float> outputContrib2sSame(d, 0);
for (size_t i = 0; i < n; i += 2) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib2s[j] = (j + 1) * (j + 1);
outputContrib2sSame[j] = (j + 1) * (j + 1);
}
// do a single step, 2 samples per step
@ -561,6 +629,15 @@ void verifyIndexPQDecoder(
weight1,
outputContrib2s.data());
// do a single step, 2 samples per step
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib2sSame.data());
// do two steps, 1 sample per step
T::accum(
pqFineCentroidsQ,
@ -576,6 +653,7 @@ void verifyIndexPQDecoder(
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]);
}
}
@ -583,12 +661,14 @@ void verifyIndexPQDecoder(
rng.seed(123);
std::vector<float> outputContrib3s(d, 0);
std::vector<float> outputContrib3sSame(d, 0);
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
outputContrib3sSame[j] = (j + 1) * (j + 1);
}
// do a single step, 3 samples per step
@ -608,6 +688,17 @@ void verifyIndexPQDecoder(
weight2,
outputContrib3s.data());
// do a single step, 3 samples per step
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3sSame.data());
// do three steps, 1 sample per step
T::accum(
pqFineCentroidsQ,
@ -628,6 +719,7 @@ void verifyIndexPQDecoder(
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]);
}
}
}
@ -723,14 +815,18 @@ void verifyMinMaxIndexPQDecoder(
std::vector<float> outputContrib2s(d, 0);
float outputMinv2s = 0;
std::vector<float> outputContrib2sSame(d, 0);
float outputMinv2sSame = 0;
for (size_t i = 0; i < n; i += 2) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib2s[j] = (j + 1) * (j + 1);
outputContrib2sSame[j] = (j + 1) * (j + 1);
}
outputMinv1s = 0;
outputMinv2s = 0;
outputMinv2sSame = 0;
// do a single step, 2 samples per step
const float weight0 = u(rng);
@ -746,6 +842,16 @@ void verifyMinMaxIndexPQDecoder(
outputContrib2s.data(),
outputMinv2s);
// do a single step, 2 samples per step
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib2sSame.data(),
outputMinv2sSame);
// do two steps, 1 sample per step
T::accum(
pqFineCentroidsQ,
@ -765,6 +871,9 @@ void verifyMinMaxIndexPQDecoder(
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib2s[j] + outputMinv2s);
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib2sSame[j] + outputMinv2sSame);
}
}
@ -773,15 +882,19 @@ void verifyMinMaxIndexPQDecoder(
std::vector<float> outputContrib3s(d, 0);
float outputMinv3s = 0;
std::vector<float> outputContrib3sSame(d, 0);
float outputMinv3sSame = 0;
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
outputContrib3sSame[j] = (j + 1) * (j + 1);
}
outputMinv1s = 0;
outputMinv3s = 0;
outputMinv3sSame = 0;
// do a single step, 3 samples per step
const float weight0 = u(rng);
@ -801,6 +914,18 @@ void verifyMinMaxIndexPQDecoder(
outputContrib3s.data(),
outputMinv3s);
// do a single step, 3 samples per step
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
encodedData.data() + (i + 1) * codeSize,
weight1,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3sSame.data(),
outputMinv3sSame);
// do three steps, 1 sample per step
T::accum(
pqFineCentroidsQ,
@ -826,6 +951,9 @@ void verifyMinMaxIndexPQDecoder(
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib3s[j] + outputMinv3s);
ASSERT_FLOAT_EQ(
outputContrib1s[j] + outputMinv1s,
outputContrib3sSame[j] + outputMinv3sSame);
}
}
}