faiss/gpu/impl/IVFFlat.cu

/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved.
#include "IVFFlat.cuh"
#include "../GpuResources.h"
#include "FlatIndex.cuh"
#include "InvertedListAppend.cuh"
#include "IVFFlatScan.cuh"
#include "RemapIndices.h"
#include "../utils/CopyUtils.cuh"
#include "../utils/DeviceDefs.cuh"
#include "../utils/DeviceUtils.h"
#include "../utils/Float16.cuh"
#include "../utils/HostTensor.cuh"
#include "../utils/Transpose.cuh"
#include <limits>
#include <thrust/host_vector.h>
#include <unordered_map>

namespace faiss { namespace gpu {

IVFFlat::IVFFlat(GpuResources* resources,
                 FlatIndex* quantizer,
                 bool l2Distance,
                 bool useFloat16,
                 IndicesOptions indicesOptions,
                 MemorySpace space) :
    IVFBase(resources,
            quantizer,
#ifdef FAISS_USE_FLOAT16
            useFloat16 ?
            sizeof(half) * quantizer->getDim()
            : sizeof(float) * quantizer->getDim(),
#else
            sizeof(float) * quantizer->getDim(),
#endif
            indicesOptions,
            space),
    l2Distance_(l2Distance),
    useFloat16_(useFloat16) {
#ifndef FAISS_USE_FLOAT16
  FAISS_ASSERT_MSG(!useFloat16, "float16 unsupported");
  useFloat16_ = false;
#endif
}

IVFFlat::~IVFFlat() {
}

void
IVFFlat::addCodeVectorsFromCpu(int listId,
                               const float* vecs,
                               const long* indices,
                               size_t numVecs) {
  // This list must already exist
  FAISS_ASSERT(listId < deviceListData_.size());
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // If there's nothing to add, then there's nothing we have to do
  if (numVecs == 0) {
    return;
  }

  size_t lengthInBytes = numVecs * bytesPerVector_;

  auto& listData = deviceListData_[listId];
  auto prevData = listData->data();

  // We only have int32 length representations on the GPU for each
  // list; the length is in sizeof(char)
  FAISS_ASSERT(listData->size() + lengthInBytes <=
               (size_t) std::numeric_limits<int>::max());

  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    // We have to convert data to the half format.
    // Make sure the source data is on our device first; it is not
    // guaranteed to be on the device before function entry, in order
    // to avoid unnecessary h2d copies
    auto floatData =
      toDevice<float, 1>(resources_,
                         getCurrentDevice(),
                         (float*) vecs,
                         stream,
                         {(int) numVecs * dim_});
    auto halfData = toHalf<1>(resources_, stream, floatData);

    listData->append((unsigned char*) halfData.data(),
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
#endif
  } else {
    listData->append((unsigned char*) vecs,
                     lengthInBytes,
                     stream,
                     true /* exact reserved size */);
  }

  // Handle the indices as well
  addIndicesFromCpu_(listId, indices, numVecs);

  // This list address may have changed due to vector resizing, but
  // only bother updating it on the device if it has changed
  if (prevData != listData->data()) {
    deviceListDataPointers_[listId] = listData->data();
  }

  // And our size has changed too
  int listLength = listData->size() / bytesPerVector_;
  deviceListLengths_[listId] = listLength;

  // We update this as well, since the multi-pass algorithm uses it
  maxListLength_ = std::max(maxListLength_, listLength);

  // device_vector add is potentially happening on a different stream
  // than our default stream
  if (stream != 0) {
    streamWait({stream}, {0});
  }
}

int
IVFFlat::classifyAndAddVectors(Tensor<float, 2, true>& vecs,
                               Tensor<long, 1, true>& indices) {
  FAISS_ASSERT(vecs.getSize(0) == indices.getSize(0));
  FAISS_ASSERT(vecs.getSize(1) == dim_);

  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Number of valid vectors that we actually add; we return this
  int numAdded = 0;

  // We don't actually need this
  DeviceTensor<float, 2, true> listDistance(mem, {vecs.getSize(0), 1}, stream);
  // We use this
  DeviceTensor<int, 2, true> listIds2d(mem, {vecs.getSize(0), 1}, stream);
  auto listIds = listIds2d.view<1>({vecs.getSize(0)});

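  // Assign each new vector to its closest coarse centroid (top-1
  // quantizer query); listIds2d then holds the target inverted list
  // for each vector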
  quantizer_->query(vecs, 1, listDistance, listIds2d, false);

  // Copy the lists that we wish to append to back to the CPU
  // FIXME: really this can be into pinned memory and a true async
  // copy on a different stream; we can start the copy early, but it's
  // tiny
  HostTensor<int, 1, true> listIdsHost(listIds, stream);

  // Now we add the encoded vectors to the individual lists
  // First, make sure that there is space available for adding the new
  // encoded vectors and indices

  // list id -> # being added
  std::unordered_map<int, int> assignCounts;

  // vector id -> offset in list
  // (we already have vector id -> list id in listIds)
  HostTensor<int, 1, true> listOffsetHost({listIdsHost.getSize(0)});

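  // On the CPU, tally how many vectors are destined for each list and
  // compute the slot (offset) that each vector will occupy within its
  // list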
  for (int i = 0; i < listIds.getSize(0); ++i) {
    int listId = listIdsHost[i];

    // The vector being added could be invalid (contains NaNs etc)
    if (listId < 0) {
      listOffsetHost[i] = -1;
      continue;
    }

    FAISS_ASSERT(listId < numLists_);
    ++numAdded;

    int offset = deviceListData_[listId]->size() / bytesPerVector_;

    auto it = assignCounts.find(listId);
    if (it != assignCounts.end()) {
      offset += it->second;
      it->second++;
    } else {
      assignCounts[listId] = 1;
    }

    listOffsetHost[i] = offset;
  }

  // If we didn't add anything (all invalid vectors), no need to
  // continue
  if (numAdded == 0) {
    return 0;
  }

  // We need to resize the data structures for the inverted lists on
  // the GPUs, which means that they might need reallocation, which
  // means that their base address may change. Figure out the new base
  // addresses, and update those in a batch on the device
  {
    for (auto& counts : assignCounts) {
      auto& data = deviceListData_[counts.first];
      data->resize(data->size() + counts.second * bytesPerVector_,
                   stream);
      int newNumVecs = (int) (data->size() / bytesPerVector_);

      auto& indices = deviceListIndices_[counts.first];
      if ((indicesOptions_ == INDICES_32_BIT) ||
          (indicesOptions_ == INDICES_64_BIT)) {
        size_t indexSize =
          (indicesOptions_ == INDICES_32_BIT) ? sizeof(int) : sizeof(long);

        indices->resize(indices->size() + counts.second * indexSize, stream);
      } else if (indicesOptions_ == INDICES_CPU) {
        // indices are stored on the CPU side
        FAISS_ASSERT(counts.first < listOffsetToUserIndex_.size());

        auto& userIndices = listOffsetToUserIndex_[counts.first];
        userIndices.resize(newNumVecs);
      } else {
        // indices are not stored on the GPU or CPU side
        FAISS_ASSERT(indicesOptions_ == INDICES_IVF);
      }

      // This is used by the multi-pass query to decide how much scratch
      // space to allocate for intermediate results
      maxListLength_ = std::max(maxListLength_, newNumVecs);
    }

    // Update all pointers to the lists on the device that may have
    // changed
    {
      std::vector<int> listIds(assignCounts.size());
      int i = 0;
      for (auto& counts : assignCounts) {
        listIds[i++] = counts.first;
      }

      updateDeviceListInfo_(listIds, stream);
    }
  }

  // If we're maintaining the indices on the CPU side, update our
  // map. We already resized our map above.
  if (indicesOptions_ == INDICES_CPU) {
    // We need to maintain the indices on the CPU side
    HostTensor<long, 1, true> hostIndices(indices, stream);

    for (int i = 0; i < hostIndices.getSize(0); ++i) {
      int listId = listIdsHost[i];

      // The vector being added could be invalid (contains NaNs etc)
      if (listId < 0) {
        continue;
      }

      int offset = listOffsetHost[i];

      FAISS_ASSERT(listId < listOffsetToUserIndex_.size());
      auto& userIndices = listOffsetToUserIndex_[listId];

      FAISS_ASSERT(offset < userIndices.size());
      userIndices[offset] = hostIndices[i];
    }
  }

  // We similarly need to actually append the new vectors
  {
    DeviceTensor<int, 1, true> listOffset(mem, listOffsetHost, stream);

    // Now, for each list to which a vector is being assigned, write it
    runIVFFlatInvertedListAppend(listIds,
                                 listOffset,
                                 vecs,
                                 indices,
                                 useFloat16_,
                                 deviceListDataPointers_,
                                 deviceListIndexPointers_,
                                 indicesOptions_,
                                 stream);
  }

  return numAdded;
}

void
IVFFlat::query(Tensor<float, 2, true>& queries,
               int nprobe,
               int k,
               Tensor<float, 2, true>& outDistances,
               Tensor<long, 2, true>& outIndices) {
  auto& mem = resources_->getMemoryManagerCurrentDevice();
  auto stream = resources_->getDefaultStreamCurrentDevice();

  // Validate these at a top level
  FAISS_ASSERT(nprobe <= 1024);
  FAISS_ASSERT(k <= 1024);

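  // We cannot probe more lists than the coarse quantizer has centroids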
  nprobe = std::min(nprobe, quantizer_->getSize());

  FAISS_ASSERT(queries.getSize(1) == dim_);
  FAISS_ASSERT(outDistances.getSize(0) == queries.getSize(0));
  FAISS_ASSERT(outIndices.getSize(0) == queries.getSize(0));

  // Reserve space for the quantized information
  DeviceTensor<float, 2, true>
    coarseDistances(mem, {queries.getSize(0), nprobe}, stream);
  DeviceTensor<int, 2, true>
    coarseIndices(mem, {queries.getSize(0), nprobe}, stream);

  // Find the `nprobe` closest lists; we can use int indices both
  // internally and externally
  quantizer_->query(queries,
                    nprobe,
                    coarseDistances,
                    coarseIndices,
                    false);

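  // Scan the selected inverted lists, computing distances against the
  // stored vectors and keeping the k best results per query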
  runIVFFlatScan(queries,
                 coarseIndices,
                 deviceListDataPointers_,
                 deviceListIndexPointers_,
                 indicesOptions_,
                 deviceListLengths_,
                 maxListLength_,
                 k,
                 l2Distance_,
                 useFloat16_,
                 outDistances,
                 outIndices,
                 resources_);

  // If the GPU isn't storing indices (they are on the CPU side), we
  // need to perform the re-mapping here
  // FIXME: we might ultimately be calling this function with inputs
  // from the CPU, these are unnecessary copies
  if (indicesOptions_ == INDICES_CPU) {
    HostTensor<long, 2, true> hostOutIndices(outIndices, stream);

    ivfOffsetToUserIndex(hostOutIndices.data(),
                         numLists_,
                         hostOutIndices.getSize(0),
                         hostOutIndices.getSize(1),
                         listOffsetToUserIndex_);

    // Copy back to GPU, since the input to this function is on the
    // GPU
    outIndices.copyFrom(hostOutIndices, stream);
  }
}

std::vector<float>
IVFFlat::getListVectors(int listId) const {
  FAISS_ASSERT(listId < deviceListData_.size());
  auto& encVecs = *deviceListData_[listId];

  auto stream = resources_->getDefaultStreamCurrentDevice();

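  // If the list is stored in float16, convert it back to float32 on
  // the device before copying the contents to the host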
  if (useFloat16_) {
#ifdef FAISS_USE_FLOAT16
    size_t num = encVecs.size() / sizeof(half);

    Tensor<half, 1, true> devHalf((half*) encVecs.data(), {(int) num});
    auto devFloat = fromHalf(resources_, stream, devHalf);

    std::vector<float> out(num);
    HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
    hostFloat.copyFrom(devFloat, stream);

    return out;
#endif
  }

  size_t num = encVecs.size() / sizeof(float);

  Tensor<float, 1, true> devFloat((float*) encVecs.data(), {(int) num});

  std::vector<float> out(num);
  HostTensor<float, 1, true> hostFloat(out.data(), {(int) num});
  hostFloat.copyFrom(devFloat, stream);

  return out;
}

} } // namespace