Additional C++ templates for fast sa_decode: additional overload for ::accum() (#2445)
Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/2445 Add overloads for ::accum() that process 3 vectors per call. This is faster than processing 2 vectors per call in certain cases, at least for the AVX2 code. Reviewed By: mdouze Differential Revision: D39176425 fbshipit-source-id: bb39bb1f7a77442d32f20cb29281ec2e2ed2600c (pull/2456/head)
parent
abb46aceae
commit
b4924aad9c
|
@ -127,6 +127,25 @@
|
|||
// const float weight1,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
// * And one more overload for ::accum that decodes and accumulates
|
||||
// three vectors per call. Sometimes, it makes sense, at least for AVX2.
|
||||
// The method signature is the following:
|
||||
// {
|
||||
// static void accum(
|
||||
// const float* const __restrict pqCoarseCentroids0,
|
||||
// const float* const __restrict pqFineCentroids0,
|
||||
// const uint8_t* const __restrict code0,
|
||||
// const float weight0,
|
||||
// const float* const __restrict pqCoarseCentroids1,
|
||||
// const float* const __restrict pqFineCentroids1,
|
||||
// const uint8_t* const __restrict code1,
|
||||
// const float weight1,
|
||||
// const float* const __restrict pqCoarseCentroids2,
|
||||
// const float* const __restrict pqFineCentroids2,
|
||||
// const uint8_t* const __restrict code2,
|
||||
// const float weight2,
|
||||
// float* const __restrict outputAccum);
|
||||
// }
|
||||
// The provided version is not multithreaded.
|
||||
//
|
||||
// Currently, an AVX2+FMA implementation is available. AVX512 version is also
|
||||
|
|
|
@ -334,6 +334,81 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
||||
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
||||
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
||||
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -497,6 +572,74 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 8>::accum(
|
||||
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
||||
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
||||
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -660,6 +803,74 @@ struct Index2LevelDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const uint8_t* const __restrict coarse0 = code0;
|
||||
const uint8_t* const __restrict coarse1 = code1;
|
||||
const uint8_t* const __restrict coarse2 = code2;
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES;
|
||||
const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t coarseCode0 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse0);
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t coarseCode1 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse1);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t coarseCode2 = detail::UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>::get(coarse2);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset,
|
||||
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
Index2LevelDecoderImpl<DIM, COARSE_SIZE, FINE_SIZE, COARSE_BITS, FINE_BITS, CPOS + 4>::accum(
|
||||
pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
|
||||
pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
|
||||
pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -712,6 +923,22 @@ struct Index2LevelDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Three-sample overload of accum() for the do-nothing specialization.
// It takes the same arguments as the working overloads but reads no input
// and leaves outputAccum untouched.
// NOTE(review): presumably this is where the CPOS recursion stops — confirm
// against the template arguments of this specialization.
static void accum(
        const float* const __restrict pqCoarseCentroids0,
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqCoarseCentroids1,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqCoarseCentroids2,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Intentionally empty.
}
|
||||
|
||||
// clang-format on
|
||||
};
|
||||
} // namespace
|
||||
|
@ -802,6 +1029,45 @@ struct Index2LevelDecoder {
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids0,
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqCoarseCentroids1,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqCoarseCentroids2,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -174,6 +174,85 @@ struct Index2LevelDecoder {
|
|||
weight1 * (*coarsePtr1 + *finePtr1);
|
||||
}
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs
|
||||
// outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1)
|
||||
// + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// coarse quantizer
|
||||
const coarse_storage_type* const __restrict coarse0 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code0);
|
||||
const coarse_storage_type* const __restrict coarse1 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code1);
|
||||
const coarse_storage_type* const __restrict coarse2 =
|
||||
reinterpret_cast<const coarse_storage_type*>(code2);
|
||||
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 =
|
||||
code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
const uint8_t* const __restrict fine1 =
|
||||
code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
const uint8_t* const __restrict fine2 =
|
||||
code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type);
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
const intptr_t coarseCentroidIdx = i / COARSE_SIZE;
|
||||
const intptr_t coarseCentroidOffset = i % COARSE_SIZE;
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx];
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t coarseCode1 = coarse1[coarseCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
const intptr_t coarseCode2 = coarse2[coarseCentroidIdx];
|
||||
const intptr_t fineCode2 = fine2[fineCentroidIdx];
|
||||
|
||||
const float* const __restrict coarsePtr0 = pqCoarseCentroids0 +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr0 = pqFineCentroids0 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict coarsePtr1 = pqCoarseCentroids1 +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids1 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict coarsePtr2 = pqCoarseCentroids2 +
|
||||
(coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) *
|
||||
COARSE_SIZE +
|
||||
coarseCentroidOffset;
|
||||
const float* const __restrict finePtr2 = pqFineCentroids2 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) +
|
||||
weight1 * (*coarsePtr1 + *finePtr1) +
|
||||
weight2 * (*coarsePtr2 + *finePtr2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -430,6 +430,145 @@ struct Index2LevelDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Three-sample overload of accum() for the current CPOS slice.
//
// Updates the 8 floats at outputAccum[CPOS .. CPOS + 7] with the weighted
// contributions of code0, code1 and code2 (applied in that order), then
// forwards every argument to the CPOS + 8 instantiation.
//
// For each sample i, pqCoarseCentroidsI / pqFineCentroidsI are its centroid
// tables, codeI is its packed code (coarse part first, fine part starting
// N_COARSE_ELEMENTS_BYTES bytes in) and weightI is its scalar weight.
static void accum(
        const float* const __restrict pqCoarseCentroids0,
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqCoarseCentroids1,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqCoarseCentroids2,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Index readers: one coarse index plus two neighbouring fine indices
    // (fine chunks are 4 floats wide, but 8 floats are handled per step).
    using CoarseReader = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>;
    using FineReaderLo = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>;
    using FineReaderHi = detail::
            UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>;

    // The fine part of each code follows its coarse part.
    const uint8_t* const fineBytes0 = code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes1 = code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes2 = code2 + N_COARSE_ELEMENTS_BYTES;

    // Centroid indices stored in the three codes.
    const intptr_t idxCoarse0 = CoarseReader::get(code0);
    const intptr_t idxFine0Lo = FineReaderLo::get(fineBytes0);
    const intptr_t idxFine0Hi = FineReaderHi::get(fineBytes0);
    const intptr_t idxCoarse1 = CoarseReader::get(code1);
    const intptr_t idxFine1Lo = FineReaderLo::get(fineBytes1);
    const intptr_t idxFine1Hi = FineReaderHi::get(fineBytes1);
    const intptr_t idxCoarse2 = CoarseReader::get(code2);
    const intptr_t idxFine2Lo = FineReaderLo::get(fineBytes2);
    const intptr_t idxFine2Hi = FineReaderHi::get(fineBytes2);

    // Addresses of the selected centroid components, sample 0.
    const float* const rowCoarse0 = pqCoarseCentroids0 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine0Lo = pqFineCentroids0 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxFine0Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const rowFine0Hi = pqFineCentroids0 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxFine0Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Sample 1.
    const float* const rowCoarse1 = pqCoarseCentroids1 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine1Lo = pqFineCentroids1 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxFine1Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const rowFine1Hi = pqFineCentroids1 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxFine1Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Sample 2.
    const float* const rowCoarse2 = pqCoarseCentroids2 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine2Lo = pqFineCentroids2 +
            ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + idxFine2Lo) *
                    FINE_SIZE +
            fineCentroidOffset;
    const float* const rowFine2Hi = pqFineCentroids2 +
            ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + idxFine2Hi) *
                    FINE_SIZE +
            fineCentroidOffset;

    // Load the running sums as two 4-float halves.
    const auto sumLo = vld1q_f32(outputAccum + CPOS);
    const auto sumHi = vld1q_f32(outputAccum + CPOS + 4);

    // Fold in the samples in order 0, 1, 2.
    auto acc = elementaryBlock4x2bAccum(
            rowCoarse0, rowFine0Lo, rowFine0Hi, weight0, {sumLo, sumHi});
    acc = elementaryBlock4x2bAccum(
            rowCoarse1, rowFine1Lo, rowFine1Hi, weight1, acc);
    acc = elementaryBlock4x2bAccum(
            rowCoarse2, rowFine2Lo, rowFine2Hi, weight2, acc);

    // Write both halves back to the same slot.
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Continue with the next 8 output floats.
    using NextStep = Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>;
    NextStep::accum(
            pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
            pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
            pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
            outputAccum);
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -652,6 +791,117 @@ struct Index2LevelDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Three-sample overload of accum() for the current CPOS slice.
//
// Updates the 8 floats at outputAccum[CPOS .. CPOS + 7] with the weighted
// contributions of code0, code1 and code2 (applied in that order), then
// forwards every argument to the CPOS + 8 instantiation.
//
// For each sample i, pqCoarseCentroidsI / pqFineCentroidsI are its centroid
// tables, codeI is its packed code (coarse part first, fine part starting
// N_COARSE_ELEMENTS_BYTES bytes in) and weightI is its scalar weight.
static void accum(
        const float* const __restrict pqCoarseCentroids0,
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqCoarseCentroids1,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqCoarseCentroids2,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Index readers: one coarse and one fine index per 8-float chunk.
    using CoarseReader = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>;
    using FineReader =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>;

    // The fine part of each code follows its coarse part.
    const uint8_t* const fineBytes0 = code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes1 = code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes2 = code2 + N_COARSE_ELEMENTS_BYTES;

    // Centroid indices stored in the three codes.
    const intptr_t idxCoarse0 = CoarseReader::get(code0);
    const intptr_t idxFine0 = FineReader::get(fineBytes0);
    const intptr_t idxCoarse1 = CoarseReader::get(code1);
    const intptr_t idxFine1 = FineReader::get(fineBytes1);
    const intptr_t idxCoarse2 = CoarseReader::get(code2);
    const intptr_t idxFine2 = FineReader::get(fineBytes2);

    // Addresses of the selected centroid components for each sample.
    const float* const rowCoarse0 = pqCoarseCentroids0 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine0 = pqFineCentroids0 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine0) * FINE_SIZE +
            fineCentroidOffset;

    const float* const rowCoarse1 = pqCoarseCentroids1 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine1 = pqFineCentroids1 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine1) * FINE_SIZE +
            fineCentroidOffset;

    const float* const rowCoarse2 = pqCoarseCentroids2 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine2 = pqFineCentroids2 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine2) * FINE_SIZE +
            fineCentroidOffset;

    // Load the running sums as two 4-float halves.
    const auto sumLo = vld1q_f32(outputAccum + CPOS);
    const auto sumHi = vld1q_f32(outputAccum + CPOS + 4);

    // Fold in the samples in order 0, 1, 2.
    auto acc = elementaryBlock8x1bAccum(
            rowCoarse0, rowFine0, weight0, {sumLo, sumHi});
    acc = elementaryBlock8x1bAccum(rowCoarse1, rowFine1, weight1, acc);
    acc = elementaryBlock8x1bAccum(rowCoarse2, rowFine2, weight2, acc);

    // Write both halves back to the same slot.
    vst1q_f32(outputAccum + CPOS, acc.val[0]);
    vst1q_f32(outputAccum + CPOS + 4, acc.val[1]);

    // Continue with the next 8 output floats.
    using NextStep = Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 8>;
    NextStep::accum(
            pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
            pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
            pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
            outputAccum);
}
|
||||
};
|
||||
|
||||
template <
|
||||
|
@ -869,6 +1119,115 @@ struct Index2LevelDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// Three-sample overload of accum() for the current CPOS slice.
//
// Updates the 4 floats at outputAccum[CPOS .. CPOS + 3] with the weighted
// contributions of code0, code1 and code2 (applied in that order), then
// forwards every argument to the CPOS + 4 instantiation.
//
// For each sample i, pqCoarseCentroidsI / pqFineCentroidsI are its centroid
// tables, codeI is its packed code (coarse part first, fine part starting
// N_COARSE_ELEMENTS_BYTES bytes in) and weightI is its scalar weight.
static void accum(
        const float* const __restrict pqCoarseCentroids0,
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqCoarseCentroids1,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqCoarseCentroids2,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Index readers: one coarse and one fine index per 4-float chunk.
    using CoarseReader = detail::
            UintReader<DIM, COARSE_SIZE, COARSE_BITS, coarseCentroidIdx>;
    using FineReader =
            detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>;

    // The fine part of each code follows its coarse part.
    const uint8_t* const fineBytes0 = code0 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes1 = code1 + N_COARSE_ELEMENTS_BYTES;
    const uint8_t* const fineBytes2 = code2 + N_COARSE_ELEMENTS_BYTES;

    // Centroid indices stored in the three codes.
    const intptr_t idxCoarse0 = CoarseReader::get(code0);
    const intptr_t idxFine0 = FineReader::get(fineBytes0);
    const intptr_t idxCoarse1 = CoarseReader::get(code1);
    const intptr_t idxFine1 = FineReader::get(fineBytes1);
    const intptr_t idxCoarse2 = CoarseReader::get(code2);
    const intptr_t idxFine2 = FineReader::get(fineBytes2);

    // Addresses of the selected centroid components for each sample.
    const float* const rowCoarse0 = pqCoarseCentroids0 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse0) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine0 = pqFineCentroids0 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine0) * FINE_SIZE +
            fineCentroidOffset;

    const float* const rowCoarse1 = pqCoarseCentroids1 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse1) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine1 = pqFineCentroids1 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine1) * FINE_SIZE +
            fineCentroidOffset;

    const float* const rowCoarse2 = pqCoarseCentroids2 +
            (coarseCentroidIdx * COARSE_TABLE_BYTES + idxCoarse2) *
                    COARSE_SIZE +
            coarseCentroidOffset;
    const float* const rowFine2 = pqFineCentroids2 +
            (fineCentroidIdx * FINE_TABLE_BYTES + idxFine2) * FINE_SIZE +
            fineCentroidOffset;

    // Load the running sums, fold in the samples in order 0, 1, 2, and
    // write the result back to the same slot.
    auto acc = vld1q_f32(outputAccum + CPOS);
    acc = elementaryBlock4x1bAccum(rowCoarse0, rowFine0, weight0, acc);
    acc = elementaryBlock4x1bAccum(rowCoarse1, rowFine1, weight1, acc);
    acc = elementaryBlock4x1bAccum(rowCoarse2, rowFine2, weight2, acc);
    vst1q_f32(outputAccum + CPOS, acc);

    // Continue with the next 4 output floats.
    using NextStep = Index2LevelDecoderImpl<
            DIM,
            COARSE_SIZE,
            FINE_SIZE,
            COARSE_BITS,
            FINE_BITS,
            CPOS + 4>;
    NextStep::accum(
            pqCoarseCentroids0, pqFineCentroids0, code0, weight0,
            pqCoarseCentroids1, pqFineCentroids1, code1, weight1,
            pqCoarseCentroids2, pqFineCentroids2, code2, weight2,
            outputAccum);
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -918,6 +1277,22 @@ struct Index2LevelDecoderImpl<
|
|||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Three-sample overload of accum() for the do-nothing specialization.
// It takes the same arguments as the working overloads but reads no input
// and leaves outputAccum untouched.
// NOTE(review): presumably this is where the CPOS recursion stops — confirm
// against the template arguments of this specialization.
static void accum(
        const float* const __restrict pqCoarseCentroids0,
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqCoarseCentroids1,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqCoarseCentroids2,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Intentionally empty.
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -1007,6 +1382,45 @@ struct Index2LevelDecoder {
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqCoarseCentroids0,
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqCoarseCentroids1,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqCoarseCentroids2,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
Index2LevelDecoderImpl<
|
||||
DIM,
|
||||
COARSE_SIZE,
|
||||
FINE_SIZE,
|
||||
COARSE_BITS,
|
||||
FINE_BITS,
|
||||
0>::
|
||||
accum(pqCoarseCentroids0,
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqCoarseCentroids1,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqCoarseCentroids2,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -273,6 +273,67 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine0);
|
||||
const intptr_t fineCode0b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine0);
|
||||
const intptr_t fineCode1a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine1);
|
||||
const intptr_t fineCode1b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine1);
|
||||
const intptr_t fineCode2a = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(fine2);
|
||||
const intptr_t fineCode2b = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset,
|
||||
pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids0, code0, weight0,
|
||||
pqFineCentroids1, code1, weight1,
|
||||
pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -391,6 +452,60 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm256_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids0, code0, weight0,
|
||||
pqFineCentroids1, code1, weight1,
|
||||
pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -509,6 +624,60 @@ struct IndexPQDecoderImpl<
|
|||
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// clang-format off
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine0);
|
||||
const intptr_t fineCode1 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine1);
|
||||
const intptr_t fineCode2 = detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::get(fine2);
|
||||
|
||||
__m128 existingValue = _mm_loadu_ps(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
_mm_storeu_ps(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids0, code0, weight0,
|
||||
pqFineCentroids1, code1, weight1,
|
||||
pqFineCentroids2, code2, weight2,
|
||||
outputAccum);
|
||||
|
||||
// clang-format on
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -553,6 +722,19 @@ struct IndexPQDecoderImpl<
|
|||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples: do-nothing overload.
// The enclosing partial specialization is expected to do nothing, so this
// overload accepts the same ten arguments as the working 3-sample accum()
// and leaves outputAccum untouched.
// NOTE(review): presumably this is where the CPOS recursion of the other
// specializations ends; the specialization header is not visible here.
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Intentionally empty.
}
|
||||
|
||||
// clang-format on
|
||||
};
|
||||
|
||||
|
@ -607,6 +789,33 @@ struct IndexPQDecoder {
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -107,6 +107,52 @@ struct IndexPQDecoder {
|
|||
outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1);
|
||||
}
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
#pragma unroll
|
||||
for (intptr_t i = 0; i < DIM; i++) {
|
||||
const intptr_t fineCentroidIdx = i / FINE_SIZE;
|
||||
const intptr_t fineCentroidOffset = i % FINE_SIZE;
|
||||
|
||||
const intptr_t fineCode0 = fine0[fineCentroidIdx];
|
||||
const intptr_t fineCode1 = fine1[fineCentroidIdx];
|
||||
const intptr_t fineCode2 = fine2[fineCentroidIdx];
|
||||
|
||||
const float* const __restrict finePtr0 = pqFineCentroids0 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict finePtr1 = pqFineCentroids1 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
const float* const __restrict finePtr2 = pqFineCentroids2 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset;
|
||||
|
||||
outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1) +
|
||||
weight2 * (*finePtr2);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -325,6 +325,107 @@ struct IndexPQDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 4 float
|
||||
// but 8 floats per loop
|
||||
|
||||
const intptr_t fineCode0a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode0b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine0);
|
||||
const intptr_t fineCode1a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode1b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine1);
|
||||
const intptr_t fineCode2a = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 0>::get(
|
||||
fine2);
|
||||
const intptr_t fineCode2b = detail::
|
||||
UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx + 1>::get(
|
||||
fine2);
|
||||
|
||||
auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids0 +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode0a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids0 +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode0b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids1 +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode1a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids1 +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode1b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x2bAccum(
|
||||
pqFineCentroids2 +
|
||||
((fineCentroidIdx + 0) * FINE_TABLE_BYTES +
|
||||
fineCode2a) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
pqFineCentroids2 +
|
||||
((fineCentroidIdx + 1) * FINE_TABLE_BYTES +
|
||||
fineCode2b) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -459,6 +560,79 @@ struct IndexPQDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 8 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
const auto existingValue0 = vld1q_f32(outputAccum + CPOS);
|
||||
const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4);
|
||||
|
||||
auto existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids0 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
{existingValue0, existingValue1});
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids1 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock8x1bAccum(
|
||||
pqFineCentroids2 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue.val[0]);
|
||||
vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 8>::accum(
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
template <intptr_t DIM, intptr_t FINE_SIZE, intptr_t FINE_BITS, intptr_t CPOS>
|
||||
|
@ -588,6 +762,77 @@ struct IndexPQDecoderImpl<
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
// fine quantizer
|
||||
const uint8_t* const __restrict fine0 = code0;
|
||||
const uint8_t* const __restrict fine1 = code1;
|
||||
const uint8_t* const __restrict fine2 = code2;
|
||||
|
||||
// process chunks, 4 float
|
||||
|
||||
const intptr_t fineCode0 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine0);
|
||||
const intptr_t fineCode1 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine1);
|
||||
const intptr_t fineCode2 =
|
||||
detail::UintReader<DIM, FINE_SIZE, FINE_BITS, fineCentroidIdx>::
|
||||
get(fine2);
|
||||
|
||||
auto existingValue = vld1q_f32(outputAccum + CPOS);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids0 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight0,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids1 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight1,
|
||||
existingValue);
|
||||
|
||||
existingValue = elementaryBlock4x1bAccum(
|
||||
pqFineCentroids2 +
|
||||
(fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) *
|
||||
FINE_SIZE +
|
||||
fineCentroidOffset,
|
||||
weight2,
|
||||
existingValue);
|
||||
|
||||
vst1q_f32(outputAccum + CPOS, existingValue);
|
||||
|
||||
// next
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, CPOS + 4>::accum(
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
// This partial specialization is expected to do nothing.
|
||||
|
@ -629,6 +874,19 @@ struct IndexPQDecoderImpl<
|
|||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
float* const __restrict outputAccum) {}
|
||||
|
||||
// Process 3 samples: do-nothing overload.
// The enclosing partial specialization is expected to do nothing, so this
// overload accepts the same ten arguments as the working 3-sample accum()
// and leaves outputAccum untouched.
// NOTE(review): presumably this is where the CPOS recursion of the other
// specializations ends; the specialization header is not visible here.
static void accum(
        const float* const __restrict pqFineCentroids0,
        const uint8_t* const __restrict code0,
        const float weight0,
        const float* const __restrict pqFineCentroids1,
        const uint8_t* const __restrict code1,
        const float weight1,
        const float* const __restrict pqFineCentroids2,
        const uint8_t* const __restrict code2,
        const float weight2,
        float* const __restrict outputAccum) {
    // Intentionally empty.
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -663,7 +921,7 @@ struct IndexPQDecoder {
|
|||
|
||||
// process 2 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1)
|
||||
// decoded(code1)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
|
@ -681,6 +939,33 @@ struct IndexPQDecoder {
|
|||
weight1,
|
||||
outputAccum);
|
||||
}
|
||||
|
||||
// process 3 samples
|
||||
// Performs outputAccum += weight0 * decoded(code0) + weight1 *
|
||||
// decoded(code1) + weight2 * decoded(code2)
|
||||
static void accum(
|
||||
const float* const __restrict pqFineCentroids0,
|
||||
const uint8_t* const __restrict code0,
|
||||
const float weight0,
|
||||
const float* const __restrict pqFineCentroids1,
|
||||
const uint8_t* const __restrict code1,
|
||||
const float weight1,
|
||||
const float* const __restrict pqFineCentroids2,
|
||||
const uint8_t* const __restrict code2,
|
||||
const float weight2,
|
||||
float* const __restrict outputAccum) {
|
||||
IndexPQDecoderImpl<DIM, FINE_SIZE, FINE_BITS, 0>::accum(
|
||||
pqFineCentroids0,
|
||||
code0,
|
||||
weight0,
|
||||
pqFineCentroids1,
|
||||
code1,
|
||||
weight1,
|
||||
pqFineCentroids2,
|
||||
code2,
|
||||
weight2,
|
||||
outputAccum);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace cppcontrib
|
||||
|
|
|
@ -201,6 +201,64 @@ void verifyIndex2LevelDecoder(
|
|||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// test contrib::accum, 3 samples per iteration.
|
||||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
const float weight0 = u(rng);
|
||||
const float weight1 = u(rng);
|
||||
const float weight2 = u(rng);
|
||||
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3s.data());
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
outputContrib1s.data());
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib1s.data());
|
||||
T::accum(
|
||||
pqCoarseCentroidsQ,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib1s.data());
|
||||
|
||||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -304,6 +362,58 @@ void verifyIndexPQDecoder(
|
|||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// test contrib::accum, 3 samples per iteration.
|
||||
rng.seed(123);
|
||||
|
||||
std::vector<float> outputContrib3s(d, 0);
|
||||
const size_t n3 = (n / 3) * 3;
|
||||
for (size_t i = 0; i < n3; i += 3) {
|
||||
// populate outputContribs with some existing data
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
outputContrib1s[j] = (j + 1) * (j + 1);
|
||||
outputContrib3s[j] = (j + 1) * (j + 1);
|
||||
}
|
||||
|
||||
// do a single step, 3 samples per step
|
||||
const float weight0 = u(rng);
|
||||
const float weight1 = u(rng);
|
||||
const float weight2 = u(rng);
|
||||
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib3s.data());
|
||||
|
||||
// do three steps, 1 sample per step
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 0) * codeSize,
|
||||
weight0,
|
||||
outputContrib1s.data());
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 1) * codeSize,
|
||||
weight1,
|
||||
outputContrib1s.data());
|
||||
T::accum(
|
||||
pqFineCentroidsQ,
|
||||
encodedData.data() + (i + 2) * codeSize,
|
||||
weight2,
|
||||
outputContrib1s.data());
|
||||
|
||||
// compare
|
||||
for (size_t j = 0; j < d; j++) {
|
||||
ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<float> generate(const size_t n, const size_t d) {
|
||||
|
|
Loading…
Reference in New Issue