/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#include "GpuIndex.h"
#include "../FaissAssert.h"
#include "GpuResources.h"
#include "utils/DeviceUtils.h"
#include <stdio.h>

namespace faiss { namespace gpu {

// Paging thresholds in bytes: an add or search whose input vectors occupy
// more than this many bytes is processed in pages
constexpr size_t kAddPageSize = static_cast<size_t>(256) * 1024 * 1024;
constexpr size_t kSearchPageSize = static_cast<size_t>(256) * 1024 * 1024;

// Paging threshold in vectors: an add with more vectors than this is
// processed in pages as well
constexpr size_t kAddVecSize = static_cast<size_t>(512) * 1024;

// The per-page vector limit for search is kept smaller, because
// precomputed code usage on IVFPQ requires substantial amounts of memory
// FIXME: parameterize based on algorithm need
constexpr size_t kSearchVecSize = static_cast<size_t>(32) * 1024;

// Constructs the state shared by all GPU indices.
//
// resources: GPU resources object; asserted non-null and initialized for the
//            selected device as the last step of construction
// dims:      vector dimensionality, forwarded to the Index base; must be > 0
// metric:    metric type, forwarded to the Index base
// config:    supplies the device id and the memory space for this index
//
// The arguments are validated with the FAISS_THROW_IF_NOT_* macros in the
// order written below, so the first failing check is the one reported.
GpuIndex::GpuIndex(GpuResources* resources,
                   int dims,
                   faiss::MetricType metric,
                   GpuIndexConfig config) :
    Index(dims, metric),
    resources_(resources),
    device_(config.device),
    memorySpace_(config.memorySpace) {
  // NOTE(review): only the upper bound of the device id is checked here; a
  // negative id passes this test -- confirm it is rejected elsewhere
  FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(),
                     "Invalid GPU device %d", device_);

  FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions");

#ifdef FAISS_UNIFIED_MEM
  // Unified memory build: device memory is always accepted, unified memory
  // only when getFullUnifiedMemSupport() reports true for the device
  FAISS_THROW_IF_NOT_FMT(
    memorySpace_ == MemorySpace::Device ||
    (memorySpace_ == MemorySpace::Unified &&
     getFullUnifiedMemSupport(device_)),
    "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)",
    config.device);
#else
  // Build without unified memory: only device memory is accepted
  FAISS_THROW_IF_NOT_MSG(memorySpace_ == MemorySpace::Device,
                     "Must compile with CUDA 8+ for Unified Memory support");
#endif

  // The resources pointer is checked only here, after the argument
  // validation above, and is not dereferenced before this point
  FAISS_ASSERT(resources_);
  resources_->initializeForDevice(device_);
}

// Adds `n` vectors read from `x`. No ids are supplied by the caller, so a
// null id pointer is forwarded to addInternal_.
void GpuIndex::add(Index::idx_t n, const float* x) {
  const Index::idx_t* noIds = nullptr;
  addInternal_(n, x, noIds);
}

// Adds `n` vectors read from `x`, forwarding the caller-provided `ids`
// unchanged to addInternal_.
void GpuIndex::add_with_ids(
    Index::idx_t n,
    const float* x,
    const Index::idx_t* ids) {
  addInternal_(n, x, ids);
}

// Shared implementation behind add() and add_with_ids().
//
// n:   number of vectors to add; nothing is added when n <= 0
// x:   input vectors, read as n consecutive rows of this->d floats
// ids: optional ids, one per vector; may be null, in which case null is
//      passed on to addImpl_ for every page
//
// Raises through FAISS_THROW_IF_NOT_MSG when the index is not trained.
//
// An input larger than kAddPageSize bytes or kAddVecSize vectors is split
// into pages, and addImpl_ is called once per page in input order.
// Otherwise addImpl_ is called once with the whole input.
void
GpuIndex::addInternal_(Index::idx_t n,
                       const float* x,
                       const Index::idx_t* ids) {
  // presumably selects device_ for the lifetime of `scope` -- see DeviceScope
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  if (n > 0) {
    const size_t numVecs = (size_t) n;
    const size_t bytesPerVec = (size_t) this->d * sizeof(float);
    const size_t totalSize = numVecs * bytesPerVec;

    if (totalSize > kAddPageSize || numVecs > kAddVecSize) {
      // How many vectors fit into kAddPageSize?
      size_t maxNumVecsForPageSize = kAddPageSize / bytesPerVec;

      // Always add at least 1 vector, if we have huge vectors
      maxNumVecsForPageSize = std::max(maxNumVecsForPageSize, (size_t) 1);

      size_t tileSize = std::min(numVecs, maxNumVecsForPageSize);

      // Cap each page with the add limit, i.e. the same limit that triggers
      // paging above. Capping with the smaller search limit made an input
      // just above kAddVecSize vectors get cut into far smaller pages than
      // an input just below it, which is added in a single call.
      tileSize = std::min(tileSize, kAddVecSize);

      for (size_t i = 0; i < numVecs; i += tileSize) {
        // The last page may be shorter than tileSize
        size_t curNum = std::min(tileSize, numVecs - i);

        addImpl_(curNum,
                 x + i * (size_t) this->d,
                 ids ? ids + i : nullptr);
      }
    } else {
      addImpl_(n, x, ids);
    }
  }
}

// Searches the index for the `k` nearest entries of each of the `n` query
// vectors in `x`.
//
// n:         number of query vectors; nothing is done when n <= 0
// x:         queries, read as n consecutive rows of this->d floats
// k:         number of results written per query
// distances: output, written as k entries per query
// labels:    output, written as k entries per query
//
// Raises through FAISS_THROW_IF_NOT_MSG when the index is not trained.
//
// A query set larger than kSearchPageSize bytes or kSearchVecSize vectors
// is handed to searchImpl_ page by page, in input order; otherwise
// searchImpl_ is called once with the whole input.
void
GpuIndex::search(Index::idx_t n,
                 const float* x,
                 Index::idx_t k,
                 float* distances,
                 Index::idx_t* labels) const {
  DeviceScope scope(device_);

  FAISS_THROW_IF_NOT_MSG(this->is_trained, "Index not trained");

  if (n <= 0) {
    return;
  }

  const size_t numQueries = (size_t) n;
  const size_t bytesPerQuery = (size_t) this->d * sizeof(float);
  const size_t queryBytes = numQueries * bytesPerQuery;

  const bool fitsInOnePage =
    queryBytes <= kSearchPageSize && numQueries <= kSearchVecSize;

  if (fitsInOnePage) {
    searchImpl_(n, x, k, distances, labels);
    return;
  }

  // Only the size of `x` is considered when sizing a page; `distances` and
  // `labels` are expected to be small by comparison. A page always holds at
  // least one vector, even if a single vector exceeds kSearchPageSize.
  const size_t vecsPerPageBytes =
    std::max(kSearchPageSize / bytesPerQuery, (size_t) 1);

  const size_t pageVecs =
    std::min(std::min(numQueries, vecsPerPageBytes), kSearchVecSize);

  for (size_t start = 0; start < numQueries; start += pageVecs) {
    // The final page may be shorter than pageVecs
    size_t count = std::min(pageVecs, numQueries - start);

    searchImpl_(count,
                x + start * (size_t) this->d,
                k,
                distances + start * k,
                labels + start * k);
  }
}

} } // namespace