Additional C++ templates for fast sa_decode: add overloads for identical coarse and fine tables (#2458)
Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2458. Add overloads of ::accum() for the case where all codes share one coarse quantizer centroids table and one fine quantizer centroids table. Reviewed By: mdouze. Differential Revision: D39314206. fbshipit-source-id: 170a0a1c434e00c95c98151e026d1e30ac017149 (ref: pull/2461/head)
parent
c740091662
commit
2cd84aa663
|
@ -114,7 +114,8 @@
|
|||
// superscalar architecture. Doing more vectors per call is less attractive
|
||||
// because of the possible lack of available CPU registers, but it is still
|
||||
// doable.
|
||||
// The method signature is the following:
|
||||
// If each code uses its own coarse quantizer centroids table and its own fine
|
||||
// quantizer centroids table, then the following overload can be used:
|
||||
// {
|
||||
// static void accum(
|
||||
// const float* const __restrict pqCoarseCentroids0,
|
||||
|
@ -127,9 +128,23 @@
|
|||
// const float weight1,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
// If codes share the coarse quantizer centroids table and also share
|
||||
// the fine quantizer centroids table, then the following overload can be
|
||||
// used:
|
||||
// {
|
||||
// static void accum(
|
||||
// const float* const __restrict pqCoarseCentroids,
|
||||
// const float* const __restrict pqFineCentroids,
|
||||
// const uint8_t* const __restrict code0,
|
||||
// const float weight0,
|
||||
// const uint8_t* const __restrict code1,
|
||||
// const float weight1,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
// * And one more overload for ::accum that decodes and accumulates
|
||||
// three vectors per call. Sometimes, it makes sense, at least for AVX2.
|
||||
// The method signature is the following:
|
||||
// If each code uses its own coarse quantizer centroids table and its own fine
|
||||
// quantizer centroids table, then the following overload can be used:
|
||||
// {
|
||||
// static void accum(
|
||||
// const float* const __restrict pqCoarseCentroids0,
|
||||
|
@ -146,6 +161,22 @@
|
|||
// const float weight2,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
// If codes share the coarse quantizer centroids table and also share
|
||||
// the fine quantizer centroids table, then the following overload can be
|
||||
// used:
|
||||
// {
|
||||
// static void accum(
|
||||
// const float* const __restrict pqCoarseCentroids,
|
||||
// const float* const __restrict pqFineCentroids,
|
||||
// const uint8_t* const __restrict code0,
|
||||
// const float weight0,
|
||||
// const uint8_t* const __restrict code1,
|
||||
// const float weight1,
|
||||
// const uint8_t* const __restrict code2,
|
||||
// const float weight2,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
//
|
||||
// The provided version is not multithreaded.
|
||||
//
|
||||
// Currently, an AVX2+FMA implementation is available. AVX512 version is also
|
||||
|
|
|
@ -277,7 +277,9 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -335,7 +337,68 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector. For the 8
// floats at outputAccum[CPOS .. CPOS + 7] it reads, per sample, one coarse
// code and two fine codes (sub-quantizers fineCentroidIdx and
// fineCentroidIdx + 1), passes the addressed entries of the shared tables to
// elementaryBlock4x2bAccum together with that sample's weight, writes the
// updated 8 floats back, and then calls the CPOS + 8 instantiation with the
// same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
|
||||
// unaligned load of the 8 running sums for this chunk
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: both samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 8 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -409,6 +472,80 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector. For the 8
// floats at outputAccum[CPOS .. CPOS + 7] it reads, per sample, one coarse
// code and two fine codes (sub-quantizers fineCentroidIdx and
// fineCentroidIdx + 1), passes the addressed entries of the shared tables to
// elementaryBlock4x2bAccum together with that sample's weight, writes the
// updated 8 floats back, and then calls the CPOS + 8 instantiation with the
// same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
||||
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
||||
|
||||
// unaligned load of the 8 running sums for this chunk
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: all samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
// sample 2
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 8 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -520,7 +657,9 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -573,7 +712,63 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector. For the 8
// floats at outputAccum[CPOS .. CPOS + 7] it reads one coarse code and one
// fine code per sample, passes the addressed entries of the shared tables to
// elementaryBlock8x1bAccum together with that sample's weight, writes the
// updated 8 floats back, and then calls the CPOS + 8 instantiation with the
// same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
|
||||
// unaligned load of the 8 running sums for this chunk
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: both samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 8 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -640,6 +835,73 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector. For the 8
// floats at outputAccum[CPOS .. CPOS + 7] it reads one coarse code and one
// fine code per sample, passes the addressed entries of the shared tables to
// elementaryBlock8x1bAccum together with that sample's weight, writes the
// updated 8 floats back, and then calls the CPOS + 8 instantiation with the
// same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
// unaligned load of the 8 running sums for this chunk
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: all samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
// sample 2
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 8 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -751,7 +1013,9 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -804,7 +1068,63 @@ struct Index2LevelDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector, working on a
// 4-float (__m128) chunk. For the 4 floats at outputAccum[CPOS .. CPOS + 3]
// it reads one coarse code and one fine code per sample, passes the
// addressed entries of the shared tables to elementaryBlock4x1bAccum
// together with that sample's weight, writes the updated 4 floats back, and
// then calls the CPOS + 4 instantiation with the same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
|
||||
// unaligned load of the 4 running sums for this chunk
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: both samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 4 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -871,6 +1191,73 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
//
// One step of the compile-time chain over the output vector, working on a
// 4-float (__m128) chunk. For the 4 floats at outputAccum[CPOS .. CPOS + 3]
// it reads one coarse code and one fine code per sample, passes the
// addressed entries of the shared tables to elementaryBlock4x1bAccum
// together with that sample's weight, writes the updated 4 floats back, and
// then calls the CPOS + 4 instantiation with the same arguments.
//
// NOTE(review): coarseCentroidIdx, fineCentroidIdx, the two *CentroidOffset
// values, N_COARSE_ELEMENTS_BYTES and the *_TABLE_BYTES constants are
// declared outside this excerpt; presumably compile-time members of the
// enclosing specialization. Confirm there before touching the index math.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer: the coarse codes sit at the very start of each code
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer: the fine codes follow N_COARSE_ELEMENTS_BYTES bytes later
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
// unaligned load of the 4 running sums for this chunk
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
// sample 0: all samples index the same pqCoarseCentroids / pqFineCentroids
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
// sample 1
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
// sample 2
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next: same arguments, output position advanced by 4 floats
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
||||
pqCoarseCentroids, pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -911,7 +1298,8 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight0,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -923,7 +1311,19 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among codes.
// The body is empty: this overload accepts the arguments and does nothing.
// NOTE(review): presumably the terminating case of the CPOS chain used by the
// other specializations; confirm against this specialization's template
// arguments, which are outside this excerpt.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -939,6 +1339,19 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight2,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among codes.
// The body is empty: this overload accepts the arguments and does nothing.
// NOTE(review): presumably the terminating case of the CPOS chain used by the
// other specializations; confirm against this specialization's template
// arguments, which are outside this excerpt.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// clang-format on
|
||||
};
|
||||
} // namespace
|
||||
|
@ -1005,9 +1418,12 @@ struct Index2LevelDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
// decoded(code1).
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1036,7 +1452,40 @@ struct Index2LevelDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
//
// Entry point only: forwards every argument unchanged to
// Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS,
// 0>::accum, i.e. it starts the implementation at output position 0.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
|
@ -1074,6 +1523,40 @@ struct Index2LevelDecoder {
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
//
// Entry point only: forwards every argument unchanged to
// Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS,
// 0>::accum, i.e. it starts the implementation at output position 0.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -122,9 +122,12 @@ struct Index2LevelDecoder {
|
|||
}
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Performs
|
||||
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1).
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -181,10 +184,72 @@ struct Index2LevelDecoder {
|
|||
}
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs
|
||||
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
|
||||
// + weight2 * decoded(code2)
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
//
// Plain loop over the DIM output components. For component i, each sample
// contributes weight * (coarse table entry + fine table entry), where the
// entries are addressed by that sample's coarse code for sub-quantizer
// i / COARSE_SIZE and fine code for sub-quantizer i / FINE_SIZE. Both
// samples look up the same pqCoarseCentroids and pqFineCentroids.
//
// Code layout as read here: (DIM / COARSE_SIZE) coarse codes of type
// coarse_storage_type, followed by the fine codes, one byte each.
//
// NOTE(review): COARSE_TABLE_BYTES / FINE_TABLE_BYTES are declared outside
// this excerpt. Despite the names, the indexing below uses them as the number
// of table entries per sub-quantizer (a stride counted in centroids).
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
// NOTE(review): reads the byte buffer through a coarse_storage_type pointer;
// assumes the code pointers are suitably aligned for that type. TODO confirm.
|
||||
const coarse_storage_type* const __restrict coarse0 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code0);
|
||||
const coarse_storage_type* const __restrict coarse1 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code1);
|
||||
|
||||
// fine quantizer: starts right after the (DIM / COARSE_SIZE) coarse codes
|
||||
const uint8_t* const __restrict fine0 =
|
||||
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
const uint8_t* const __restrict fine1 =
|
||||
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
// which sub-quantizer covers component i, and the position inside its centroid
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
|
||||
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
|
||||
// table address = (sub-quantizer * entries-per-table + code) * centroid size + offset
const float* const __restrict coarsePtr0 = pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr0 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict coarsePtr1 = pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
// accumulate both weighted decoded components into the existing value
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
|
||||
weight1 * (*coarsePtr1 + *finePtr1);
|
||||
}
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -259,6 +324,83 @@ struct Index2LevelDecoder {
|
|||
weight2 * (*coarsePtr2 + *finePtr2);
|
||||
}
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const coarse_storage_type* const __restrict coarse0 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code0);
|
||||
const coarse_storage_type* const __restrict coarse1 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code1);
|
||||
const coarse_storage_type* const __restrict coarse2 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code2);
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 =
|
||||
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
const uint8_t* const __restrict fine1 =
|
||||
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
const uint8_t* const __restrict fine2 =
|
||||
code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
|
||||
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
|
||||
const intptr_t fineCode2 = fine2[fineCentroidIdx];
|
||||
|
||||
const float* const __restrict coarsePtr0 = pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr0 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict coarsePtr1 = pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict coarsePtr2 = pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr2 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
|
||||
weight1 * (*coarsePtr1 + *finePtr1) +
|
||||
weight2 * (*coarsePtr2 + *finePtr2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -329,7 +329,9 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -431,7 +433,109 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode0b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode1b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine1);
|
||||
|
||||
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode0a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode0b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode1a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode1b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 8>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -569,6 +673,139 @@ struct Index2LevelDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode0b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode1b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine1);
|
||||
const intptr_t coarseCode2 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse2);
|
||||
const intptr_t fineCode2a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine2);
|
||||
const intptr_t fineCode2b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine2);
|
||||
|
||||
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode0a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode0b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode1a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode1b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode2a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode2b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 8>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -709,7 +946,9 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -792,7 +1031,90 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
|
||||
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 8>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -902,6 +1224,111 @@ struct Index2LevelDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t coarseCode2 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse2);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 8>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -1039,7 +1466,9 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1120,7 +1549,88 @@ struct Index2LevelDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
|
||||
auto existingValue = vld1q_f32(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 4>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1228,6 +1738,109 @@ struct Index2LevelDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse0);
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t coarseCode1 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse1);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t coarseCode2 = detail::
|
||||
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
|
||||
get(coarse2);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
auto existingValue = vld1q_f32(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset,
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
CPOS + 4>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -1266,7 +1879,9 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight0,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1278,7 +1893,21 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 2 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Deliberately a no-op in this specialization.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // nothing to do
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1293,6 +1922,20 @@ struct Index2LevelDecoderImpl<
|
|||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples.
// Coarse pq centroids table and fine pq centroids table are shared among
// codes.
//
// Deliberately a no-op in this specialization.
static void accum(
        const float* const __restrict pqCoarseCentroids,
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // nothing to do
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -1358,9 +2001,12 @@ struct Index2LevelDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
// decoded(code1).
|
||||
//
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1389,9 +2035,42 @@ struct Index2LevelDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
//
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
//
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
|
@ -1427,6 +2106,40 @@ struct Index2LevelDecoder {
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
//
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -97,7 +97,10 @@ struct IndexMinMaxDecoder {
|
|||
minvAccum += minv;
|
||||
}
|
||||
|
||||
// Process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -137,7 +140,48 @@ struct IndexMinMaxDecoder {
|
|||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 2 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const float* const __restrict code0Float =
|
||||
reinterpret_cast<const float*>(code0);
|
||||
const float scaler0 = code0Float[0] * weight0;
|
||||
const float minv0 = code0Float[1] * weight0;
|
||||
|
||||
const float* const __restrict code1Float =
|
||||
reinterpret_cast<const float*>(code1);
|
||||
const float scaler1 = code1Float[0] * weight1;
|
||||
const float minv1 = code1Float[1] * weight1;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(float),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(float),
|
||||
scaler1,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -173,7 +217,46 @@ struct IndexMinMaxDecoder {
|
|||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const float* const __restrict code0Float =
|
||||
reinterpret_cast<const float*>(code0);
|
||||
const float scaler0 = code0Float[0] * weight0;
|
||||
const float minv0 = code0Float[1] * weight0;
|
||||
|
||||
const float* const __restrict code1Float =
|
||||
reinterpret_cast<const float*>(code1);
|
||||
const float scaler1 = code1Float[0] * weight1;
|
||||
const float minv1 = code1Float[1] * weight1;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(float),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(float),
|
||||
scaler1,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -227,7 +310,58 @@ struct IndexMinMaxDecoder {
|
|||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// + weight2 * scaler2 * decoded(code2)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const float* const __restrict code0Float =
|
||||
reinterpret_cast<const float*>(code0);
|
||||
const float scaler0 = code0Float[0] * weight0;
|
||||
const float minv0 = code0Float[1] * weight0;
|
||||
|
||||
const float* const __restrict code1Float =
|
||||
reinterpret_cast<const float*>(code1);
|
||||
const float scaler1 = code1Float[0] * weight1;
|
||||
const float minv1 = code1Float[1] * weight1;
|
||||
|
||||
const float* const __restrict code2Float =
|
||||
reinterpret_cast<const float*>(code2);
|
||||
const float scaler2 = code2Float[0] * weight2;
|
||||
const float minv2 = code2Float[1] * weight2;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(float),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(float),
|
||||
scaler1,
|
||||
code2 + 2 * sizeof(float),
|
||||
scaler2,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -274,6 +408,52 @@ struct IndexMinMaxDecoder {
|
|||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// + weight2 * scaler2 * decoded(code2)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const float* const __restrict code0Float =
|
||||
reinterpret_cast<const float*>(code0);
|
||||
const float scaler0 = code0Float[0] * weight0;
|
||||
const float minv0 = code0Float[1] * weight0;
|
||||
|
||||
const float* const __restrict code1Float =
|
||||
reinterpret_cast<const float*>(code1);
|
||||
const float scaler1 = code1Float[0] * weight1;
|
||||
const float minv1 = code1Float[1] * weight1;
|
||||
|
||||
const float* const __restrict code2Float =
|
||||
reinterpret_cast<const float*>(code2);
|
||||
const float scaler2 = code2Float[0] * weight2;
|
||||
const float minv2 = code2Float[1] * weight2;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(float),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(float),
|
||||
scaler1,
|
||||
code2 + 2 * sizeof(float),
|
||||
scaler2,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -102,7 +102,10 @@ struct IndexMinMaxFP16Decoder {
|
|||
minvAccum += minv;
|
||||
}
|
||||
|
||||
// Process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -142,7 +145,48 @@ struct IndexMinMaxFP16Decoder {
|
|||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 2 samples
|
||||
// Process 2 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const uint16_t* const __restrict code0FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code0);
|
||||
const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0;
|
||||
const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0;
|
||||
|
||||
const uint16_t* const __restrict code1FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code1);
|
||||
const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1;
|
||||
const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(uint16_t),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(uint16_t),
|
||||
scaler1,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -178,7 +222,46 @@ struct IndexMinMaxFP16Decoder {
|
|||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const uint16_t* const __restrict code0FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code0);
|
||||
const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0;
|
||||
const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0;
|
||||
|
||||
const uint16_t* const __restrict code1FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code1);
|
||||
const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1;
|
||||
const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(uint16_t),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(uint16_t),
|
||||
scaler1,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own coarse pq centroids table and fine pq centroids
|
||||
// table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -232,7 +315,58 @@ struct IndexMinMaxFP16Decoder {
|
|||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples
|
||||
// Process 3 samples.
|
||||
// Coarse pq centroids table and fine pq centroids table are shared among
|
||||
// codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// + weight2 * scaler2 * decoded(code2)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids,
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const uint16_t* const __restrict code0FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code0);
|
||||
const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0;
|
||||
const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0;
|
||||
|
||||
const uint16_t* const __restrict code1FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code1);
|
||||
const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1;
|
||||
const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1;
|
||||
|
||||
const uint16_t* const __restrict code2FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code2);
|
||||
const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2;
|
||||
const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqCoarseCentroids,
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(uint16_t),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(uint16_t),
|
||||
scaler1,
|
||||
code2 + 2 * sizeof(uint16_t),
|
||||
scaler2,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
|
@ -279,6 +413,52 @@ struct IndexMinMaxFP16Decoder {
|
|||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs
|
||||
// * outputAccum += weight0 * scaler0 * decoded(code0)
|
||||
// + weight1 * scaler1 * decoded(code1)
|
||||
// + weight2 * scaler2 * decoded(code2)
|
||||
// * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum,
|
||||
float& minvAccum) {
|
||||
const uint16_t* const __restrict code0FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code0);
|
||||
const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0;
|
||||
const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0;
|
||||
|
||||
const uint16_t* const __restrict code1FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code1);
|
||||
const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1;
|
||||
const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1;
|
||||
|
||||
const uint16_t* const __restrict code2FP16 =
|
||||
reinterpret_cast<const uint16_t*>(code2);
|
||||
const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2;
|
||||
const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2;
|
||||
|
||||
SubIndexT::accum(
|
||||
pqFineCentroids,
|
||||
code0 + 2 * sizeof(uint16_t),
|
||||
scaler0,
|
||||
code1 + 2 * sizeof(uint16_t),
|
||||
scaler1,
|
||||
code2 + 2 * sizeof(uint16_t),
|
||||
scaler2,
|
||||
outputAccum);
|
||||
|
||||
minvAccum += minv0 + minv1 + minv2;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -226,7 +226,8 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -274,7 +275,57 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -334,6 +385,67 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
||||
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -410,7 +522,8 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -453,7 +566,52 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -506,6 +664,60 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -582,7 +794,8 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -625,7 +838,52 @@ struct IndexPQDecoderImpl<
|
|||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
|
||||
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids,
|
||||
code0, weight0,
|
||||
code1, weight1,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -678,6 +936,59 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids, code0, weight0,
|
||||
code1, weight1,
|
||||
code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -712,7 +1023,8 @@ struct IndexPQDecoderImpl<
|
|||
const float weight0,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -722,7 +1034,18 @@ struct IndexPQDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 2 samples.
// Fine pq centroids table is shared among codes.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {
    // Deliberately empty: this overload reads nothing and leaves outputAccum
    // untouched.
    // NOTE(review): presumably the terminal step of the CPOS recursion;
    // confirm against the specialization header.
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -735,6 +1058,18 @@ struct IndexPQDecoderImpl<
|
|||
const float weight2,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples.
// Fine pq centroids table is shared among codes.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Deliberately empty: this overload reads nothing and leaves outputAccum
    // untouched.
    // NOTE(review): presumably the terminal step of the CPOS recursion;
    // confirm against the specialization header.
}
|
||||
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
|
@ -773,7 +1108,9 @@ struct IndexPQDecoder {
|
|||
pqFineCentroids, code, weight, outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
static void accum(
|
||||
|
@ -794,7 +1131,25 @@ struct IndexPQDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
|
@ -820,6 +1175,31 @@ struct IndexPQDecoder {
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -76,7 +76,9 @@ struct IndexPQDecoder {
|
|||
}
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs
|
||||
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
|
||||
static void accum(
|
||||
|
@ -112,7 +114,46 @@ struct IndexPQDecoder {
|
|||
}
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs
|
||||
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
|
||||
const float* const __restrict finePtr0 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1);
|
||||
}
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
|
@ -157,6 +198,52 @@ struct IndexPQDecoder {
|
|||
weight2 * (*finePtr2);
|
||||
}
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
const intptr_t fineCode2 = fine2[fineCentroidIdx];
|
||||
|
||||
const float* const __restrict finePtr0 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict finePtr2 = pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1) +
|
||||
weight2 * (*finePtr2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -252,7 +252,8 @@ struct IndexPQDecoderImpl<
|
|||
pqFineCentroids0, code0, weight0, outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -326,7 +327,76 @@ struct IndexPQDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode0b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode1a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode1b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine1);
|
||||
|
||||
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode0a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode0b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode1a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode1b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -426,6 +496,104 @@ struct IndexPQDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode0b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode1a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode1b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode2a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine2);
|
||||
const intptr_t fineCode2b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine2);
|
||||
|
||||
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode0a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode0b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode1a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode1b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode2a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode2b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -506,7 +674,8 @@ struct IndexPQDecoderImpl<
|
|||
pqFineCentroids0, code0, weight0, outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -561,7 +730,57 @@ struct IndexPQDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
|
||||
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -633,6 +852,76 @@ struct IndexPQDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -710,7 +999,8 @@ struct IndexPQDecoderImpl<
|
|||
pqFineCentroids0, code0, weight0, outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -763,7 +1053,55 @@ struct IndexPQDecoderImpl<
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
|
||||
auto existingValue = vld1q_f32(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -833,6 +1171,74 @@ struct IndexPQDecoderImpl<
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
auto existingValue = vld1q_f32(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -865,7 +1271,8 @@ struct IndexPQDecoderImpl<
|
|||
const float weight0,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -875,7 +1282,18 @@ struct IndexPQDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// process 3 samples
|
||||
// Two-sample overload for codes that are decoded against one shared fine pq
// centroids table.
// It belongs to the partial specialization that is expected to do nothing,
// so the body is deliberately empty: no argument is read and outputAccum is
// left exactly as it was.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -887,6 +1305,18 @@ struct IndexPQDecoderImpl<
|
|||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Three-sample overload for codes that are decoded against one shared fine
// pq centroids table.
// It belongs to the partial specialization that is expected to do nothing,
// so the body is deliberately empty: no argument is read and outputAccum is
// left exactly as it was.
static void accum(
        const float* const __restrict pqFineCentroids,
        const uint8_t* const __restrict code0,
        const float weight0,
        const uint8_t* const __restrict code1,
        const float weight1,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -923,7 +1353,9 @@ struct IndexPQDecoder {
|
|||
pqFineCentroids, code, weight, outputAccum);
|
||||
}
|
||||
|
||||
// process 2 samples
|
||||
// Process 2 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
static void accum(
|
||||
|
@ -944,7 +1376,25 @@ struct IndexPQDecoder {
|
|||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Process 2 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids, code0, weight0, code1, weight1, outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Each code uses its own fine pq centroids table.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
|
@ -970,6 +1420,31 @@ struct IndexPQDecoder {
|
|||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Process 3 samples.
|
||||
// Fine pq centroids table is shared among codes.
|
||||
//
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids,
|
||||
code0,
|
||||
weight0,
|
||||
code1,
|
||||
weight1,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -162,11 +162,13 @@ void verifyIndex2LevelDecoder(
|
|||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib2s(d, 0);
|
||||
std::vector<float> outputContrib2sSame(d, 0);
|
||||
for (size_t i = 0; i < n; i += 2) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
|
@ -184,6 +186,16 @@ void verifyIndex2LevelDecoder(
|
|||
weight1,
|
||||
outputContrib2s.data());
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib2sSame.data());
|
||||
|
||||
// do two steps, 1 sample per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
|
@ -201,6 +213,7 @@ void verifyIndex2LevelDecoder(
|
|||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -208,12 +221,14 @@ void verifyIndex2LevelDecoder(
|
|||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
std::vector<float> outputContrib3sSame(d, 0);
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
|
@ -236,6 +251,18 @@ void verifyIndex2LevelDecoder(
|
|||
weight2,
|
||||
outputContrib3s.data());
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3sSame.data());
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
|
@ -259,6 +286,7 @@ void verifyIndex2LevelDecoder(
|
|||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -358,15 +386,19 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib2s(d, 0);
|
||||
std::vector<float> outputContrib2sSame(d, 0);
|
||||
float outputMinv2s = 0;
|
||||
float outputMinv2sSame = 0;
|
||||
for (size_t i = 0; i < n; i += 2) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
outputMinv1s = 0;
|
||||
outputMinv2s = 0;
|
||||
outputMinv2sSame = 0;
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
const float weight0 = u(rng);
|
||||
|
@ -384,6 +416,17 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
outputContrib2s.data(),
|
||||
outputMinv2s);
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib2sSame.data(),
|
||||
outputMinv2sSame);
|
||||
|
||||
// do two steps, 1 sample per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
|
@ -405,6 +448,9 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib2s[j] + outputMinv2s);
|
||||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib2sSame[j] + outputMinv2sSame);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -413,15 +459,19 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
float outputMinv3s = 0;
|
||||
std::vector<float> outputContrib3sSame(d, 0);
|
||||
float outputMinv3sSame = 0;
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
outputMinv1s = 0;
|
||||
outputMinv3s = 0;
|
||||
outputMinv3sSame = 0;
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
const float weight0 = u(rng);
|
||||
|
@ -444,6 +494,19 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
outputContrib3s.data(),
|
||||
outputMinv3s);
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3sSame.data(),
|
||||
outputMinv3sSame);
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
|
@ -472,6 +535,9 @@ void verifyMinMaxIndex2LevelDecoder(
|
|||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib3s[j] + outputMinv3s);
|
||||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib3sSame[j] + outputMinv3sSame);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -541,11 +607,13 @@ void verifyIndexPQDecoder(
|
|||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib2s(d, 0);
|
||||
std::vector<float> outputContrib2sSame(d, 0);
|
||||
for (size_t i = 0; i < n; i += 2) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
|
@ -561,6 +629,15 @@ void verifyIndexPQDecoder(
|
|||
weight1,
|
||||
outputContrib2s.data());
|
||||
|
||||
// do a single step, 2 samples per step, using the overload in which both
// codes share one fine centroids table
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib2sSame.data());
|
||||
|
||||
// do two steps, 1 sample per step
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
|
@ -576,6 +653,7 @@ void verifyIndexPQDecoder(
|
|||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -583,12 +661,14 @@ void verifyIndexPQDecoder(
|
|||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
std::vector<float> outputContrib3sSame(d, 0);
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
|
@ -608,6 +688,17 @@ void verifyIndexPQDecoder(
|
|||
weight2,
|
||||
outputContrib3s.data());
|
||||
|
||||
// do a single step, 3 samples per step, using the overload in which all
// codes share one fine centroids table
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3sSame.data());
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
|
@ -628,6 +719,7 @@ void verifyIndexPQDecoder(
|
|||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -723,14 +815,18 @@ void verifyMinMaxIndexPQDecoder(
|
|||
|
||||
std::vector<float> outputContrib2s(d, 0);
|
||||
float outputMinv2s = 0;
|
||||
std::vector<float> outputContrib2sSame(d, 0);
|
||||
float outputMinv2sSame = 0;
|
||||
for (size_t i = 0; i < n; i += 2) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2s[j] = (j + 1) * (j + 1);
|
||||
outputContrib2sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
outputMinv1s = 0;
|
||||
outputMinv2s = 0;
|
||||
outputMinv2sSame = 0;
|
||||
|
||||
// do a single step, 2 samples per step
|
||||
const float weight0 = u(rng);
|
||||
|
@ -746,6 +842,16 @@ void verifyMinMaxIndexPQDecoder(
|
|||
outputContrib2s.data(),
|
||||
outputMinv2s);
|
||||
|
||||
// do a single step, 2 samples per step, using the overload in which both
// codes share one fine centroids table
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib2sSame.data(),
|
||||
outputMinv2sSame);
|
||||
|
||||
// do two steps, 1 sample per step
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
|
@ -765,6 +871,9 @@ void verifyMinMaxIndexPQDecoder(
|
|||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib2s[j] + outputMinv2s);
|
||||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib2sSame[j] + outputMinv2sSame);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -773,15 +882,19 @@ void verifyMinMaxIndexPQDecoder(
|
|||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
float outputMinv3s = 0;
|
||||
std::vector<float> outputContrib3sSame(d, 0);
|
||||
float outputMinv3sSame = 0;
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3sSame[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
outputMinv1s = 0;
|
||||
outputMinv3s = 0;
|
||||
outputMinv3sSame = 0;
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
const float weight0 = u(rng);
|
||||
|
@ -801,6 +914,18 @@ void verifyMinMaxIndexPQDecoder(
|
|||
outputContrib3s.data(),
|
||||
outputMinv3s);
|
||||
|
||||
// do a single step, 3 samples per step, using the overload in which all
// codes share one fine centroids table
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3sSame.data(),
|
||||
outputMinv3sSame);
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
|
@ -826,6 +951,9 @@ void verifyMinMaxIndexPQDecoder(
|
|||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib3s[j] + outputMinv3s);
|
||||
ASSERT_FLOAT_EQ(
|
||||
outputContrib1s[j] + outputMinv1s,
|
||||
outputContrib3sSame[j] + outputMinv3sSame);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue