Additional C++ templates for fast sa_decode: additional overload for ::accum() (#2445)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2445

Add overloads for ::accum() to process 3 vectors per call. It is faster than processing 2 vectors per call in certain cases, at least for the AVX2 code.

Reviewed By: mdouze

Differential Revision: D39176425

fbshipit-source-id: bb39bb1f7a77442d32f20cb29281ec2e2ed2600c
pull/2456/head
Alexandr Guzhva 2022-09-05 10:04:32 -07:00 committed by Facebook GitHub Bot
parent abb46aceae
commit b4924aad9c
8 changed files with 1429 additions and 1 deletions

View File

@ -127,6 +127,25 @@
// const float weight1,
// float* const __restrict outputAccum);
// }
// * And one more overload for ::accum that decodes and accumulates
// three vectors per call. In certain cases it is faster than the
// two-vector overload, at least for the AVX2 code.
// The method signature is the following:
// {
// static void accum(
// const float* const __restrict pqCoarseCentroids0,
// const float* const __restrict pqFineCentroids0,
// const uint8_t* const __restrict code0,
// const float weight0,
// const float* const __restrict pqCoarseCentroids1,
// const float* const __restrict pqFineCentroids1,
// const uint8_t* const __restrict code1,
// const float weight1,
// const float* const __restrict pqCoarseCentroids2,
// const float* const __restrict pqFineCentroids2,
// const uint8_t* const __restrict code2,
// const float weight2,
// float* const __restrict outputAccum);
// }
// The provided version is not multithreaded.
//
// Currently, an AVX2+FMA implementation is available. AVX512 version is also

View File

@ -334,6 +334,81 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
template <
@ -497,6 +572,74 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 8 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
template <
@ -660,6 +803,74 @@ struct Index2LevelDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// clang-format off
// process chunks, 4 float
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
// This partial specialization is expected to do nothing.
@ -712,6 +923,22 @@ struct Index2LevelDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Three-sample overload of the do-nothing specialization: the body is
// intentionally empty, so no argument is read and outputAccum is left
// untouched.
// NOTE(review): presumably this is the overload that the CPOS + 8 / CPOS + 4
// forwarding calls of the other specializations end on; confirm against
// this specialization's template arguments.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
// clang-format on
};
} // namespace
@ -802,6 +1029,45 @@ struct Index2LevelDecoder {
weight1,
outputAccum);
}
// process 3 samples
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
0>::
accum(pqCoarseCentroids0,
pqFineCentroids0,
code0,
weight0,
pqCoarseCentroids1,
pqFineCentroids1,
code1,
weight1,
pqCoarseCentroids2,
pqFineCentroids2,
code2,
weight2,
outputAccum);
}
};
} // namespace cppcontrib

View File

@ -174,6 +174,85 @@ struct Index2LevelDecoder {
weight1 * (*coarsePtr1 + *finePtr1);
}
}
// process 3 samples
// Performs
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
// + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const coarse_storage_type* const __restrict coarse0 =
reinterpret_cast<const coarse_storage_type*>(code0);
const coarse_storage_type* const __restrict coarse1 =
reinterpret_cast<const coarse_storage_type*>(code1);
const coarse_storage_type* const __restrict coarse2 =
reinterpret_cast<const coarse_storage_type*>(code2);
// fine quantizer
const uint8_t* const __restrict fine0 =
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
const uint8_t* const __restrict fine1 =
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
const uint8_t* const __restrict fine2 =
code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
#pragma unroll
for (intptr_t i = 0; i < DIM; i++) {
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
const intptr_t fineCentroidIdx = i / FINE_SIZE;
const intptr_t fineCentroidOffset = i % FINE_SIZE;
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
const intptr_t fineCode0 = fine0[fineCentroidIdx];
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
const intptr_t fineCode1 = fine1[fineCentroidIdx];
const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
const intptr_t fineCode2 = fine2[fineCentroidIdx];
const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr0 = pqFineCentroids0 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset;
const float* const __restrict coarsePtr1 = pqCoarseCentroids1 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr1 = pqFineCentroids1 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset;
const float* const __restrict coarsePtr2 = pqCoarseCentroids2 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
COARSE_SIZE +
coarseCentroidOffset;
const float* const __restrict finePtr2 = pqFineCentroids2 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset;
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
weight1 * (*coarsePtr1 + *finePtr1) +
weight2 * (*coarsePtr2 + *finePtr2);
}
}
};
} // namespace cppcontrib

View File

@ -430,6 +430,145 @@ struct Index2LevelDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// process chunks, 4 float
// but 8 floats per loop
const intptr_t coarseCode0 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse0);
const intptr_t fineCode0a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine0);
const intptr_t fineCode0b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine0);
const intptr_t coarseCode1 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse1);
const intptr_t fineCode1a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine1);
const intptr_t fineCode1b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine1);
const intptr_t coarseCode2 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse2);
const intptr_t fineCode2a = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
fine2);
const intptr_t fineCode2b = detail::
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
fine2);
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids0 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids0 +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode0a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids0 +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode0b) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids1 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids1 +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode1a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids1 +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode1b) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqCoarseCentroids2 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids2 +
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
fineCode2a) *
FINE_SIZE +
fineCentroidOffset,
pqFineCentroids2 +
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
fineCode2b) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
CPOS + 8>::
accum(pqCoarseCentroids0,
pqFineCentroids0,
code0,
weight0,
pqCoarseCentroids1,
pqFineCentroids1,
code1,
weight1,
pqCoarseCentroids2,
pqFineCentroids2,
code2,
weight2,
outputAccum);
}
};
template <
@ -652,6 +791,117 @@ struct Index2LevelDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// process chunks, 8 float
const intptr_t coarseCode0 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse0);
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t coarseCode1 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse1);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
const intptr_t coarseCode2 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse2);
const intptr_t fineCode2 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine2);
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
auto existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids0 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids0 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
{existingValue0, existingValue1});
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids1 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids1 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqCoarseCentroids2 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids2 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
// next
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
CPOS + 8>::
accum(pqCoarseCentroids0,
pqFineCentroids0,
code0,
weight0,
pqCoarseCentroids1,
pqFineCentroids1,
code1,
weight1,
pqCoarseCentroids2,
pqFineCentroids2,
code2,
weight2,
outputAccum);
}
};
template <
@ -869,6 +1119,115 @@ struct Index2LevelDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// coarse quantizer
const uint8_t* const __restrict coarse0 = code0;
const uint8_t* const __restrict coarse1 = code1;
const uint8_t* const __restrict coarse2 = code2;
// fine quantizer
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
// process chunks, 4 float
const intptr_t coarseCode0 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse0);
const intptr_t fineCode0 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine0);
const intptr_t coarseCode1 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse1);
const intptr_t fineCode1 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine1);
const intptr_t coarseCode2 = detail::
UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::
get(coarse2);
const intptr_t fineCode2 =
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
get(fine2);
auto existingValue = vld1q_f32(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids0 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids0 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
FINE_SIZE +
fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids1 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids1 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
FINE_SIZE +
fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqCoarseCentroids2 +
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
COARSE_SIZE +
coarseCentroidOffset,
pqFineCentroids2 +
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
FINE_SIZE +
fineCentroidOffset,
weight2,
existingValue);
vst1q_f32(outputAccum + CPOS, existingValue);
// next
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
CPOS + 4>::
accum(pqCoarseCentroids0,
pqFineCentroids0,
code0,
weight0,
pqCoarseCentroids1,
pqFineCentroids1,
code1,
weight1,
pqCoarseCentroids2,
pqFineCentroids2,
code2,
weight2,
outputAccum);
}
};
// This partial specialization is expected to do nothing.
@ -918,6 +1277,22 @@ struct Index2LevelDecoderImpl<
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Three-sample overload of the do-nothing specialization: the body is
// intentionally empty, so no argument is read and outputAccum is left
// untouched.
// NOTE(review): presumably this is the overload that the CPOS + 8 / CPOS + 4
// forwarding calls of the other specializations end on; confirm against
// this specialization's template arguments.
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
};
} // namespace
@ -1007,6 +1382,45 @@ struct Index2LevelDecoder {
weight1,
outputAccum);
}
// process 3 samples
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1) + weight2 * decoded(code2)
static void accum(
const float* const __restrict pqCoarseCentroids0,
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqCoarseCentroids1,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqCoarseCentroids2,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
Index2LevelDecoderImpl<
DIM,
COARSE_SIZE,
FINE_SIZE,
COARSE_BITS,
FINE_BITS,
0>::
accum(pqCoarseCentroids0,
pqFineCentroids0,
code0,
weight0,
pqCoarseCentroids1,
pqFineCentroids1,
code1,
weight1,
pqCoarseCentroids2,
pqFineCentroids2,
code2,
weight2,
outputAccum);
}
};
} // namespace cppcontrib

View File

@ -273,6 +273,67 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 4 float
// but 8 floats per loop
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x2bAccum(
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids0, code0, weight0,
pqFineCentroids1, code1, weight1,
pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -391,6 +452,60 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 8 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock8x1bAccum(
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
pqFineCentroids0, code0, weight0,
pqFineCentroids1, code1, weight1,
pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -509,6 +624,60 @@ struct IndexPQDecoderImpl<
// clang-format on
}
// process 3 samples
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {
// fine quantizer
const uint8_t* const __restrict fine0 = code0;
const uint8_t* const __restrict fine1 = code1;
const uint8_t* const __restrict fine2 = code2;
// clang-format off
// process chunks, 4 float
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
weight0,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
weight1,
existingValue);
existingValue = elementaryBlock4x1bAccum(
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
weight2,
existingValue);
_mm_storeu_ps(outputAccum + CPOS, existingValue);
// next
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
pqFineCentroids0, code0, weight0,
pqFineCentroids1, code1, weight1,
pqFineCentroids2, code2, weight2,
outputAccum);
// clang-format on
}
};
// This partial specialization is expected to do nothing.
@ -553,6 +722,19 @@ struct IndexPQDecoderImpl<
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Intentionally empty 3-sample overload: the enclosing partial
// specialization is expected to do nothing, so all arguments are ignored
// and outputAccum is left untouched.
// NOTE(review): presumably this is where the CPOS-advancing chain of
// IndexPQDecoderImpl::accum calls ends - confirm against the
// specialization's template arguments, which are outside this hunk.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
// clang-format on
};
@ -607,6 +789,33 @@ struct IndexPQDecoder {
weight1,
outputAccum);
}
// process 3 samples
// Performs
//   outputAccum += weight0 * decoded(code0)
//                + weight1 * decoded(code1)
//                + weight2 * decoded(code2)
// This entry point only forwards: every argument is handed unchanged to
// IndexPQDecoderImpl instantiated with a start position of 0.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K
//   weightK          - scalar weight applied to sample K
//   outputAccum      - output buffer, updated in place
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // clang-format off
    // one (codebook, code, weight) triple per line
    IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
            pqFineCentroids0, code0, weight0,
            pqFineCentroids1, code1, weight1,
            pqFineCentroids2, code2, weight2,
            outputAccum);
    // clang-format on
}
};
} // namespace cppcontrib

View File

@ -107,6 +107,52 @@ struct IndexPQDecoder {
outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1);
}
}
// process 3 samples
// Performs
//   outputAccum += weight0 * decoded(code0)
//                + weight1 * decoded(code1)
//                + weight2 * decoded(code2)
// Scalar implementation: walks the DIM output positions one by one and,
// for each, looks up the matching float in each of the three codebooks.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K, indexed one byte per
//                      subquantizer
//   weightK          - scalar weight applied to sample K
//   outputAccum      - output buffer of DIM floats, updated in place
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
#pragma unroll
    for (intptr_t dim = 0; dim < DIM; ++dim) {
        // which subquantizer covers this position, and where inside it
        const intptr_t subq = dim / FINE_SIZE;
        const intptr_t within = dim % FINE_SIZE;

        // fine quantizer: the byte selecting the codebook entry per sample
        const intptr_t entry0 = code0[subq];
        const intptr_t entry1 = code1[subq];
        const intptr_t entry2 = code2[subq];

        // flat index of (subquantizer, entry, position) in each codebook
        const intptr_t at0 =
                (subq * FINE_TABLE_BYTES + entry0) * FINE_SIZE + within;
        const intptr_t at1 =
                (subq * FINE_TABLE_BYTES + entry1) * FINE_SIZE + within;
        const intptr_t at2 =
                (subq * FINE_TABLE_BYTES + entry2) * FINE_SIZE + within;

        // the three weighted terms are summed first, then added to the output
        outputAccum[dim] += weight0 * pqFineCentroids0[at0] +
                weight1 * pqFineCentroids1[at1] +
                weight2 * pqFineCentroids2[at2];
    }
}
};
} // namespace cppcontrib

View File

@ -325,6 +325,107 @@ struct IndexPQDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
// Folds three encoded samples into the 8 floats of outputAccum that start
// at CPOS, then delegates the remaining positions to the CPOS + 8
// instantiation of IndexPQDecoderImpl. The 8 floats are covered by two
// consecutive codes per sample (positions fineCentroidIdx and
// fineCentroidIdx + 1), 4 floats each.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K
//   weightK          - scalar weight passed along with sample K
//   outputAccum      - output buffer, read and written in place
// NOTE(review): elementaryBlock4x2bAccum is defined elsewhere; it presumably
// returns the accumulator pair plus weight times the two 4-float entries -
// confirm against its definition.
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // fine quantizer: per sample, the code at fineCentroidIdx ("Lo") and
    // the one right after it ("Hi")
    const intptr_t idxLo0 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    code0);
    const intptr_t idxHi0 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    code0);
    const intptr_t idxLo1 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    code1);
    const intptr_t idxHi1 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    code1);
    const intptr_t idxLo2 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
                    code2);
    const intptr_t idxHi2 = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
                    code2);

    // addresses of the two selected codebook entries for every sample
    const float* const entryLo0 = pqFineCentroids0 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxLo0) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entryHi0 = pqFineCentroids0 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxHi0) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entryLo1 = pqFineCentroids1 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxLo1) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entryHi1 = pqFineCentroids1 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxHi1) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entryLo2 = pqFineCentroids2 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxLo2) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entryHi2 = pqFineCentroids2 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxHi2) * FINE_SIZE +
            fineCentroidOffset;

    // 8 floats per step, held as two 4-float registers
    float* const chunk = outputAccum + CPOS;
    auto accLo = vld1q_f32(chunk);
    auto accHi = vld1q_f32(chunk + 4);

    // fold in samples 0, 1, 2 (in this order)
    auto acc = elementaryBlock4x2bAccum(
            entryLo0, entryHi0, weight0, {accLo, accHi});
    acc = elementaryBlock4x2bAccum(entryLo1, entryHi1, weight1, acc);
    acc = elementaryBlock4x2bAccum(entryLo2, entryHi2, weight2, acc);

    vst1q_f32(chunk, acc.val[0]);
    vst1q_f32(chunk + 4, acc.val[1]);

    // continue with the following chunk
    IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
            pqFineCentroids0,
            code0,
            weight0,
            pqFineCentroids1,
            code1,
            weight1,
            pqFineCentroids2,
            code2,
            weight2,
            outputAccum);
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -459,6 +560,79 @@ struct IndexPQDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
// Folds three encoded samples into the 8 floats of outputAccum that start
// at CPOS, then delegates the remaining positions to the CPOS + 8
// instantiation of IndexPQDecoderImpl. A single code per sample covers
// all 8 floats.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K
//   weightK          - scalar weight passed along with sample K
//   outputAccum      - output buffer, read and written in place
// NOTE(review): elementaryBlock8x1bAccum is defined elsewhere; it presumably
// returns the accumulator pair plus weight * entry[0..7] - confirm against
// its definition.
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // fine quantizer: the code of every sample at position fineCentroidIdx
    const intptr_t idx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code0);
    const intptr_t idx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code1);
    const intptr_t idx2 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code2);

    // address of the selected codebook entry for every sample
    const float* const entry0 = pqFineCentroids0 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx0) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entry1 = pqFineCentroids1 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx1) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entry2 = pqFineCentroids2 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx2) * FINE_SIZE +
            fineCentroidOffset;

    // 8 floats per step, held as two 4-float registers
    float* const chunk = outputAccum + CPOS;
    const auto accLo = vld1q_f32(chunk);
    const auto accHi = vld1q_f32(chunk + 4);

    // fold in samples 0, 1, 2 (in this order)
    auto acc = elementaryBlock8x1bAccum(entry0, weight0, {accLo, accHi});
    acc = elementaryBlock8x1bAccum(entry1, weight1, acc);
    acc = elementaryBlock8x1bAccum(entry2, weight2, acc);

    vst1q_f32(chunk, acc.val[0]);
    vst1q_f32(chunk + 4, acc.val[1]);

    // continue with the following chunk
    IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
            pqFineCentroids0,
            code0,
            weight0,
            pqFineCentroids1,
            code1,
            weight1,
            pqFineCentroids2,
            code2,
            weight2,
            outputAccum);
}
};
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
@ -588,6 +762,77 @@ struct IndexPQDecoderImpl<
weight1,
outputAccum);
}
// process 3 samples
// Folds three encoded samples into the 4 floats of outputAccum that start
// at CPOS, then delegates the remaining positions to the CPOS + 4
// instantiation of IndexPQDecoderImpl.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K
//   weightK          - scalar weight passed along with sample K
//   outputAccum      - output buffer, read and written in place
// NOTE(review): elementaryBlock4x1bAccum is defined elsewhere; it presumably
// returns acc + weight * entry[0..3] - confirm against its definition.
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // fine quantizer: the code of every sample at position fineCentroidIdx
    const intptr_t idx0 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code0);
    const intptr_t idx1 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code1);
    const intptr_t idx2 =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
                    get(code2);

    // address of the selected codebook entry for every sample
    const float* const entry0 = pqFineCentroids0 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx0) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entry1 = pqFineCentroids1 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx1) * FINE_SIZE +
            fineCentroidOffset;
    const float* const entry2 = pqFineCentroids2 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idx2) * FINE_SIZE +
            fineCentroidOffset;

    // one chunk of 4 floats: load, fold in samples 0, 1, 2 (in this order),
    // store back
    float* const chunk = outputAccum + CPOS;
    auto acc = vld1q_f32(chunk);
    acc = elementaryBlock4x1bAccum(entry0, weight0, acc);
    acc = elementaryBlock4x1bAccum(entry1, weight1, acc);
    acc = elementaryBlock4x1bAccum(entry2, weight2, acc);
    vst1q_f32(chunk, acc);

    // continue with the following chunk
    IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
            pqFineCentroids0,
            code0,
            weight0,
            pqFineCentroids1,
            code1,
            weight1,
            pqFineCentroids2,
            code2,
            weight2,
            outputAccum);
}
};
// This partial specialization is expected to do nothing.
@ -629,6 +874,19 @@ struct IndexPQDecoderImpl<
const uint8_t* const __restrict code1,
const float weight1,
float* const __restrict outputAccum) {}
// process 3 samples
// Intentionally empty 3-sample overload: the enclosing partial
// specialization is expected to do nothing, so all arguments are ignored
// and outputAccum is left untouched.
// NOTE(review): presumably this is where the CPOS-advancing chain of
// IndexPQDecoderImpl::accum calls ends - confirm against the
// specialization's template arguments, which are outside this hunk.
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
const float weight0,
const float* const __restrict pqFineCentroids1,
const uint8_t* const __restrict code1,
const float weight1,
const float* const __restrict pqFineCentroids2,
const uint8_t* const __restrict code2,
const float weight2,
float* const __restrict outputAccum) {}
};
} // namespace
@ -663,7 +921,7 @@ struct IndexPQDecoder {
// process 2 samples
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
// decoded(code1)
static void accum(
const float* const __restrict pqFineCentroids0,
const uint8_t* const __restrict code0,
@ -681,6 +939,33 @@ struct IndexPQDecoder {
weight1,
outputAccum);
}
// process 3 samples
// Performs
//   outputAccum += weight0 * decoded(code0)
//                + weight1 * decoded(code1)
//                + weight2 * decoded(code2)
// This entry point only forwards: every argument is handed unchanged to
// IndexPQDecoderImpl instantiated with a start position of 0.
//   pqFineCentroidsK - fine codebook used for sample K
//   codeK            - encoded bytes of sample K
//   weightK          - scalar weight applied to sample K
//   outputAccum      - output buffer, updated in place
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // clang-format off
    // one (codebook, code, weight) triple per line
    IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
            pqFineCentroids0, code0, weight0,
            pqFineCentroids1, code1, weight1,
            pqFineCentroids2, code2, weight2,
            outputAccum);
    // clang-format on
}
};
} // namespace cppcontrib

View File

@ -201,6 +201,64 @@ void verifyIndex2LevelDecoder(
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
}
}
// test contrib::accum, 3 samples per iteration.
rng.seed(123);
std::vector<float> outputContrib3s(d, 0);
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
}
// do a single step, 3 samples per step
const float weight0 = u(rng);
const float weight1 = u(rng);
const float weight2 = u(rng);
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 1) * codeSize,
weight1,
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3s.data());
// do three steps, 1 sample per step
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
outputContrib1s.data());
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib1s.data());
T::accum(
pqCoarseCentroidsQ,
pqFineCentroidsQ,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib1s.data());
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
}
}
}
template <typename T>
@ -304,6 +362,58 @@ void verifyIndexPQDecoder(
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
}
}
// test contrib::accum, 3 samples per iteration.
rng.seed(123);
std::vector<float> outputContrib3s(d, 0);
const size_t n3 = (n / 3) * 3;
for (size_t i = 0; i < n3; i += 3) {
// populate outputContribs with some existing data
for (size_t j = 0; j < d; j++) {
outputContrib1s[j] = (j + 1) * (j + 1);
outputContrib3s[j] = (j + 1) * (j + 1);
}
// do a single step, 3 samples per step
const float weight0 = u(rng);
const float weight1 = u(rng);
const float weight2 = u(rng);
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
pqFineCentroidsQ,
encodedData.data() + (i + 1) * codeSize,
weight1,
pqFineCentroidsQ,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib3s.data());
// do three steps, 1 sample per step
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 0) * codeSize,
weight0,
outputContrib1s.data());
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 1) * codeSize,
weight1,
outputContrib1s.data());
T::accum(
pqFineCentroidsQ,
encodedData.data() + (i + 2) * codeSize,
weight2,
outputContrib1s.data());
// compare
for (size_t j = 0; j < d; j++) {
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
}
}
}
std::vector<float> generate(const size_t n, const size_t d) {