/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <faiss/gpu/impl/IVFPQ.cuh>
#include <faiss/gpu/GpuResources.h>
#include <faiss/gpu/impl/BroadcastSum.cuh>
#include <faiss/gpu/impl/Distance.cuh>
#include <faiss/gpu/impl/FlatIndex.cuh>
#include <faiss/gpu/impl/IVFAppend.cuh>
#include <faiss/gpu/impl/L2Norm.cuh>
#include <faiss/gpu/impl/PQCodeDistances.cuh>
#include <faiss/gpu/impl/PQScanMultiPassNoPrecomputed.cuh>
#include <faiss/gpu/impl/PQScanMultiPassPrecomputed.cuh>
#include <faiss/gpu/impl/RemapIndices.h>
#include <faiss/gpu/impl/VectorResidual.cuh>
#include <faiss/gpu/utils/ConversionOperators.cuh>
#include <faiss/gpu/utils/DeviceDefs.cuh>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/HostTensor.cuh>
#include <faiss/gpu/utils/MatrixMult.cuh>
#include <faiss/gpu/utils/NoTypeTensor.cuh>
#include <faiss/gpu/utils/Transpose.cuh>
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFPQ::IVFPQ(GpuResources* resources,
             faiss::MetricType metric,
             float metricArg,
             FlatIndex* quantizer,
             int numSubQuantizers,
             int bitsPerSubQuantizer,
             float* pqCentroidData,
             IndicesOptions indicesOptions,
             bool useFloat16LookupTables,
             MemorySpace space) :
    IVFBase(resources,
            metric,
            metricArg,
            quantizer,
            numSubQuantizers,
            indicesOptions,
            space),
    numSubQuantizers_(numSubQuantizers),
    bitsPerSubQuantizer_(bitsPerSubQuantizer),
    numSubQuantizerCodes_(utils::pow2(bitsPerSubQuantizer_)),
    dimPerSubQuantizer_(dim_ / numSubQuantizers),
    precomputedCodes_(false),
    useFloat16LookupTables_(useFloat16LookupTables) {
  FAISS_ASSERT(pqCentroidData);

  FAISS_ASSERT(bitsPerSubQuantizer_ <= 8);
  FAISS_ASSERT(dim_ % numSubQuantizers_ == 0);
  FAISS_ASSERT(isSupportedPQCodeLength(bytesPerVector_));
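  // Since each sub-quantizer code uses at most 8 bits, every code occupies
  // one byte; bytesPerVector_ (set to numSubQuantizers in the IVFBase
  // constructor call above) is therefore the encoded length checked here.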

  setPQCentroids_(pqCentroidData);
}
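
// A rough usage sketch (illustrative only; the real wiring lives in
// GpuIndexIVFPQ, and the variable names below are hypothetical):
//
//   // coarseQuantizer: FlatIndex holding the trained IVF centroids
//   // pqCentroids: (numSubQuantizers, 2^bits, dim / numSubQuantizers)
//   //              row-major float data, as consumed by setPQCentroids_
//   IVFPQ index(resources, faiss::METRIC_L2, 0, &coarseQuantizer,
//               numSubQuantizers, bits, pqCentroids,
//               INDICES_64_BIT, false /* float32 lookup tables */,
//               MemorySpace::Device);
//   index.setPrecomputedCodes(true);          // optional; L2 only
//   index.classifyAndAddVectors(vecs, ids);   // encode and append to lists
//   index.query(queries, nprobe, k, outDistances, outIndices);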

IVFPQ::~IVFPQ() {
}

bool
IVFPQ::isSupportedPQCodeLength(int size) {
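  // The sizes listed below are the code lengths (bytes per encoded vector)
  // for which the PQ scan and append kernels are specialized; other sizes
  // have no kernel instantiation. The sizes above 48 bytes are limited to
  // float16 lookup tables, presumably because only at half precision do the
  // per-query distance tables still fit in shared memory.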
  switch (size) {
    case 1:
    case 2:
    case 3:
    case 4:
    case 8:
    case 12:
    case 16:
    case 20:
    case 24:
    case 28:
    case 32:
    case 40:
    case 48:
    case 56: // only supported with float16
    case 64: // only supported with float16
    case 96: // only supported with float16
      return true;
    default:
      return false;
  }
}

bool
IVFPQ::isSupportedNoPrecomputedSubDimSize(int dims) {
  return faiss::gpu::isSupportedNoPrecomputedSubDimSize(dims);
}

void
IVFPQ::setPrecomputedCodes(bool enable) {
  if (enable && metric_ == MetricType::METRIC_INNER_PRODUCT) {
    FAISS_THROW_MSG("Precomputed codes are not needed for GpuIndexIVFPQ "
                    "with METRIC_INNER_PRODUCT");
  }

  if (precomputedCodes_ != enable) {
    precomputedCodes_ = enable;

    if (precomputedCodes_) {
      precomputeCodes_();
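      // The precomputed term-2 table built here has shape
      // (#coarse centroids, numSubQuantizers_, numSubQuantizerCodes_),
      // stored as float32 (or float16 when useFloat16LookupTables_), so
      // enabling precomputed codes trades this extra memory for less
      // per-query work.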
    } else {
      // Clear out old precomputed code data
      precomputedCode_ = std::move(DeviceTensor<float, 3, true>());
      precomputedCodeHalf_ = std::move(DeviceTensor<half, 3, true>());
    }
  }
}

int
IVFPQ::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                             Tensor<long, 1, true>& indices) {
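  // For each input vector: find its closest coarse centroid, PQ-encode the
  // residual against that centroid, and append the resulting code plus the
  // user-provided index to the corresponding inverted list. Returns the
  // number of vectors actually added (vectors whose coarse assignment is
  // invalid are skipped).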
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

  quantizer_->query(vecs,
                    1,
                    metric_,
                    metricArg_,
                    listDistance,
                    listIds2d,
                    false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Calculate the residual for each closest centroid
  DeviceTensor<float, 2, true> residuals(
    mem, {vecs.getSize(0), vecs.getSize(1)}, stream);

  if (quantizer_->getUseFloat16()) {
    auto& coarseCentroids = quantizer_->getVectorsFloat16Ref();
    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
  } else {
    auto& coarseCentroids = quantizer_->getVectorsFloat32Ref();
    runCalcResidual(vecs, coarseCentroids, listIds, residuals, stream);
  }

  // Residuals are in the form
  // (vec x numSubQuantizer x dimPerSubQuantizer)
  // transpose to
  // (numSubQuantizer x vec x dimPerSubQuantizer)
  auto residualsView = residuals.view<3>(
    {residuals.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});

  DeviceTensor<float, 3, true> residualsTranspose(
    mem,
    {numSubQuantizers_, residuals.getSize(0), dimPerSubQuantizer_},
    stream);

  runTransposeAny(residualsView, 0, 1, residualsTranspose, stream);

  // Get the product quantizer centroids in the form
  // (numSubQuantizer x numSubQuantizerCodes x dimPerSubQuantizer)
  // which is pqCentroidsMiddleCode_

  // We now have a batch operation to find the top-1 distances:
  // batch size: numSubQuantizer
  // centroids: (numSubQuantizerCodes x dimPerSubQuantizer)
  // residuals: (vec x dimPerSubQuantizer)
  // => (numSubQuantizer x vec x 1)

  DeviceTensor<float, 3, true> closestSubQDistance(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);
  DeviceTensor<int, 3, true> closestSubQIndex(
    mem, {numSubQuantizers_, residuals.getSize(0), 1}, stream);

  for (int subQ = 0; subQ < numSubQuantizers_; ++subQ) {
    auto closestSubQDistanceView = closestSubQDistance[subQ].view();
    auto closestSubQIndexView = closestSubQIndex[subQ].view();

    auto pqCentroidsMiddleCodeView = pqCentroidsMiddleCode_[subQ].view();
    auto residualsTransposeView = residualsTranspose[subQ].view();

    runL2Distance(resources_,
                  pqCentroidsMiddleCodeView,
                  true, // pqCentroidsMiddleCodeView is row major
                  nullptr, // no precomputed norms
                  residualsTransposeView,
                  true, // residualsTransposeView is row major
                  1,
                  closestSubQDistanceView,
                  closestSubQIndexView,
                  // We don't care about distances
                  true);
  }

  // Now, we have the nearest sub-q centroid for each slice of the
  // residual vector.
  auto closestSubQIndexView = closestSubQIndex.view<2>(
    {numSubQuantizers_, residuals.getSize(0)});

  // Transpose this for easy use
  DeviceTensor<int, 2, true> encodings(
    mem, {residuals.getSize(0), numSubQuantizers_}, stream);

  runTransposeAny(closestSubQIndexView, 0, 1, encodings, stream);
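
  // encodings is now (vec)(numSubQuantizers_), holding each vector's selected
  // code per sub-quantizer as an int32; the append kernel below stores one
  // byte per code in the inverted list data.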

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

  for (int i = 0; i < listIdsHost.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // Add vector could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    // Resize all of the lists that we are appending to
    for (auto& counts : assignCounts) {
      auto& codes = deviceListData_[counts.first];
      codes->resize(codes->size() + counts.second * bytesPerVector_,
                    stream);
      int newNumVecs = (int) (codes->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers and sizes on the device for lists that we
    // appended to
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // Add vector could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new encoded vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // This kernel will handle appending each encoded vector + index to
    // the appropriate list
    runIVFPQInvertedListAppend(listIds,
                               listOffset,
                               encodings,
                               indices,
                               deviceListDataPointers_,
                               deviceListIndexPointers_,
                               indicesOptions_,
                               stream);
  }

  return numAdded;
}

void
IVFPQ::addCodeVectorsFromCpu(int listId,
                             const void* codes,
                             const long* indices,
                             size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listCodes = deviceListData_[listId];
  auto prevCodeData = listCodes->data();

  // We only have int32 length representations on the GPU per each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listCodes->size() % bytesPerVector_ == 0);
  FAISS_ASSERT(listCodes->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  listCodes->append((unsigned char*) codes,
                    lengthInBytes,
                    stream,
                    true /* exact reserved size */);

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevCodeData != listCodes->data()) {
    deviceListDataPointers_[listId] = listCodes->data();
  }

  // And our size has changed too
  int listLength = listCodes->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (resources_->getDefaultStreamCurrentDevice() != 0) {
    streamWait({stream}, {0});
  }
}

void
IVFPQ::setPQCentroids_(float* data) {
  size_t pqSize =
    numSubQuantizers_ * numSubQuantizerCodes_ * dimPerSubQuantizer_;

  // Make sure the data is on the host
  // FIXME: why are we doing this?
  thrust::host_vector<float> hostMemory;
  hostMemory.insert(hostMemory.end(), data, data + pqSize);

  HostTensor<float, 3, true> pqHost(
    hostMemory.data(),
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 3, true> pqDevice(
    pqHost,
    resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> pqDeviceTranspose(
    {numSubQuantizers_, dimPerSubQuantizer_, numSubQuantizerCodes_});
  runTransposeAny(pqDevice, 1, 2, pqDeviceTranspose,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsInnermostCode_ = std::move(pqDeviceTranspose);

  // Also maintain the PQ centroids in the form
  // (sub q)(code id)(sub dim)
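  // (pqCentroidsMiddleCode_ is used when encoding vectors and when building
  // the precomputed term tables; the innermost-code layout above,
  // (sub q)(sub dim)(code id), feeds the no-precomputed-codes scan)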
  DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
    {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_});
  runTransposeAny(pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode,
                  resources_->getDefaultStreamCurrentDevice());

  pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode);
}

template <typename CentroidT>
void
IVFPQ::precomputeCodesT_() {
  FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

  //
  // d = || x - y_C ||^2 + || y_R ||^2 + 2 * (y_C|y_R) - 2 * (x|y_R)
  //     ---------------   ---------------------------       -------
  //         term 1                  term 2                   term 3
  //

  // Terms 1 and 3 are available only at query time. We compute term 2
  // here.
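  //
  // This follows from writing a database vector's reconstruction as
  // y = y_C + y_R (coarse centroid plus PQ residual reconstruction):
  //   || x - y ||^2 = || (x - y_C) - y_R ||^2
  //                 = || x - y_C ||^2 + || y_R ||^2 - 2 * (x - y_C|y_R)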

  // Compute ||y_R||^2 by treating
  // (sub q)(code id)(sub dim) as (sub q * code id)(sub dim)
  auto pqCentroidsMiddleCodeView =
    pqCentroidsMiddleCode_.view<2>(
      {numSubQuantizers_ * numSubQuantizerCodes_, dimPerSubQuantizer_});
  DeviceTensor<float, 1, true> subQuantizerNorms(
    {numSubQuantizers_ * numSubQuantizerCodes_});

  runL2Norm(pqCentroidsMiddleCodeView, true,
            subQuantizerNorms, true,
            resources_->getDefaultStreamCurrentDevice());

  // Compute 2 * (y_C|y_R) via batch matrix multiplication
  // batch size (sub q) x {(centroid id)(sub dim) x (code id)(sub dim)'}
  // => (sub q) x {(centroid id)(code id)}
  // => (sub q)(centroid id)(code id)

  // View (centroid id)(dim) as
  // (centroid id)(sub q)(dim)
  // Transpose (centroid id)(sub q)(sub dim) to
  // (sub q)(centroid id)(sub dim)
  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
  auto centroidView = coarseCentroids.template view<3>(
    {coarseCentroids.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
  DeviceTensor<CentroidT, 3, true> centroidsTransposed(
    {numSubQuantizers_, coarseCentroids.getSize(0), dimPerSubQuantizer_});

  runTransposeAny(centroidView, 0, 1, centroidsTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  DeviceTensor<float, 3, true> coarsePQProduct(
    {numSubQuantizers_, coarseCentroids.getSize(0), numSubQuantizerCodes_});

  runIteratedMatrixMult(coarsePQProduct, false,
                        centroidsTransposed, false,
                        pqCentroidsMiddleCode_, true,
                        2.0f, 0.0f,
                        resources_->getBlasHandleCurrentDevice(),
                        resources_->getDefaultStreamCurrentDevice());

  // Transpose (sub q)(centroid id)(code id) to
  // (centroid id)(sub q)(code id)
  DeviceTensor<float, 3, true> coarsePQProductTransposed(
    {coarseCentroids.getSize(0), numSubQuantizers_, numSubQuantizerCodes_});
  runTransposeAny(coarsePQProduct, 0, 1, coarsePQProductTransposed,
                  resources_->getDefaultStreamCurrentDevice());

  // View (centroid id)(sub q)(code id) as
  // (centroid id)(sub q * code id)
  auto coarsePQProductTransposedView = coarsePQProductTransposed.view<2>(
    {coarseCentroids.getSize(0), numSubQuantizers_ * numSubQuantizerCodes_});

  // Sum || y_R ||^2 + 2 * (y_C|y_R)
  // i.e., add norms (sub q * code id)
  // along columns of inner product (centroid id)(sub q * code id)
  runSumAlongColumns(subQuantizerNorms, coarsePQProductTransposedView,
                     resources_->getDefaultStreamCurrentDevice());

  // We added into the view, so `coarsePQProductTransposed` is now our
  // precomputed term 2.
  if (useFloat16LookupTables_) {
    precomputedCodeHalf_ =
      convertTensor<float, half, 3>(resources_,
                                    resources_->getDefaultStreamCurrentDevice(),
                                    coarsePQProductTransposed);
  } else {
    precomputedCode_ = std::move(coarsePQProductTransposed);
  }
}

void
IVFPQ::precomputeCodes_() {
  if (quantizer_->getUseFloat16()) {
    precomputeCodesT_<half>();
  } else {
    precomputeCodesT_<float>();
  }
}

void
IVFPQ::query(Tensor<float, 2, true>& queries,
             int nprobe,
             int k,
             Tensor<float, 2, true>& outDistances,
             Tensor<long, 2, true>& outIndices) {
  // These are caught at a higher level
  FAISS_ASSERT(nprobe <= GPU_MAX_SELECTION_K);
  FAISS_ASSERT(k <= GPU_MAX_SELECTION_K);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the closest coarse centroids
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest coarse centroids; we can use int
  // indices both internally and externally
  quantizer_->query(queries,
                    nprobe,
                    metric_,
                    metricArg_,
                    coarseDistances,
                    coarseIndices,
                    true);
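
  // coarseDistances / coarseIndices now hold, per query, the nprobe closest
  // coarse centroids. For L2 with precomputed codes, coarseDistances is used
  // directly as term 1 of the decomposition below.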

  if (precomputedCodes_) {
    FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

    runPQPrecomputedCodes_(queries,
                           coarseDistances,
                           coarseIndices,
                           k,
                           outDistances,
                           outIndices);
  } else {
    runPQNoPrecomputedCodes_(queries,
                             coarseDistances,
                             coarseIndices,
                             k,
                             outDistances,
                             outIndices);
  }

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

std::vector<unsigned char>
IVFPQ::getListCodes(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());

  return deviceListData_[listId]->copyToHost<unsigned char>(
    resources_->getDefaultStreamCurrentDevice());
}

Tensor<float, 3, true>
IVFPQ::getPQCentroids() {
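  // Returned layout is (sub q)(code id)(sub dim)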
  return pqCentroidsMiddleCode_;
}

void
IVFPQ::runPQPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  FAISS_ASSERT(metric_ == MetricType::METRIC_L2);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Compute precomputed code term 3, - 2 * (x|y_R)
  // This is done via batch MM
  // {sub q} x {(query id)(sub dim) * (code id)(sub dim)'} =>
  // {sub q} x {(query id)(code id)}
  DeviceTensor<float, 3, true> term3Transposed(
    mem,
    {queries.getSize(0), numSubQuantizers_, numSubQuantizerCodes_},
    stream);

  // These allocations within are only temporary, so release them when
  // we're done to maximize free space
  {
    auto querySubQuantizerView = queries.view<3>(
      {queries.getSize(0), numSubQuantizers_, dimPerSubQuantizer_});
    DeviceTensor<float, 3, true> queriesTransposed(
      mem,
      {numSubQuantizers_, queries.getSize(0), dimPerSubQuantizer_},
      stream);
    runTransposeAny(querySubQuantizerView, 0, 1, queriesTransposed, stream);

    DeviceTensor<float, 3, true> term3(
      mem,
      {numSubQuantizers_, queries.getSize(0), numSubQuantizerCodes_},
      stream);

    runIteratedMatrixMult(term3, false,
                          queriesTransposed, false,
                          pqCentroidsMiddleCode_, true,
                          -2.0f, 0.0f,
                          resources_->getBlasHandleCurrentDevice(),
                          stream);

    runTransposeAny(term3, 0, 1, term3Transposed, stream);
  }
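
  // term3Transposed is now (query id)(sub q)(code id): for each query and
  // sub-quantizer, -2 * (x|y_R) evaluated for every possible code y_R.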

  NoTypeTensor<3, true> term2;
  NoTypeTensor<3, true> term3;
  DeviceTensor<half, 3, true> term3Half;

  if (useFloat16LookupTables_) {
    term3Half =
      convertTensor<float, half, 3>(resources_, stream, term3Transposed);

    term2 = NoTypeTensor<3, true>(precomputedCodeHalf_);
    term3 = NoTypeTensor<3, true>(term3Half);
  } else {
    term2 = NoTypeTensor<3, true>(precomputedCode_);
    term3 = NoTypeTensor<3, true>(term3Transposed);
  }

  runPQScanMultiPassPrecomputed(queries,
                                coarseDistances, // term 1
                                term2, // term 2
                                term3, // term 3
                                coarseIndices,
                                useFloat16LookupTables_,
                                bytesPerVector_,
                                numSubQuantizers_,
                                numSubQuantizerCodes_,
                                deviceListDataPointers_,
                                deviceListIndexPointers_,
                                indicesOptions_,
                                deviceListLengths_,
                                maxListLength_,
                                k,
                                outDistances,
                                outIndices,
                                resources_);
}

template <typename CentroidT>
void
IVFPQ::runPQNoPrecomputedCodesT_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  auto& coarseCentroids = quantizer_->template getVectorsRef<CentroidT>();
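
  // Without precomputed tables, the scan computes the per-(query, probed
  // centroid) code distance tables on the fly from the query's residual
  // against each probed coarse centroid, using pqCentroidsInnermostCode_.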

  runPQScanMultiPassNoPrecomputed(queries,
                                  coarseCentroids,
                                  pqCentroidsInnermostCode_,
                                  coarseIndices,
                                  useFloat16LookupTables_,
                                  bytesPerVector_,
                                  numSubQuantizers_,
                                  numSubQuantizerCodes_,
                                  deviceListDataPointers_,
                                  deviceListIndexPointers_,
                                  indicesOptions_,
                                  deviceListLengths_,
                                  maxListLength_,
                                  k,
                                  metric_,
                                  outDistances,
                                  outIndices,
                                  resources_);
}

void
IVFPQ::runPQNoPrecomputedCodes_(
  Tensor<float, 2, true>& queries,
  DeviceTensor<float, 2, true>& coarseDistances,
  DeviceTensor<int, 2, true>& coarseIndices,
  int k,
  Tensor<float, 2, true>& outDistances,
  Tensor<long, 2, true>& outIndices) {
  if (quantizer_->getUseFloat16()) {
    runPQNoPrecomputedCodesT_<half>(queries,
                                    coarseDistances,
                                    coarseIndices,
                                    k,
                                    outDistances,
                                    outIndices);
  } else {
    runPQNoPrecomputedCodesT_<float>(queries,
                                     coarseDistances,
                                     coarseIndices,
                                     k,
                                     outDistances,
                                     outIndices);
  }
}

} } // namespace