diff --git a/contrib/pytorch_tensors.py b/contrib/pytorch_tensors.py index ae559582f..ab3d0a2c0 100644 --- a/contrib/pytorch_tensors.py +++ b/contrib/pytorch_tensors.py @@ -1,5 +1,6 @@ import faiss import torch +import contextlib def swig_ptr_from_FloatTensor(x): """ gets a Faiss SWIG pointer from a pytorch trensor (on CPU or GPU) """ @@ -15,9 +16,33 @@ def swig_ptr_from_LongTensor(x): return faiss.cast_integer_to_long_ptr( x.storage().data_ptr() + x.storage_offset() * 8) +@contextlib.contextmanager +def using_stream(res, pytorch_stream=None): + r""" Creates a scoping object to make Faiss GPU use the same stream + as pytorch, based on torch.cuda.current_stream(). + Or, a specific pytorch stream can be passed in as a second + argument, in which case we will use that stream. + """ + if pytorch_stream is None: + pytorch_stream = torch.cuda.current_stream() -def search_index_pytorch(index, x, k, D=None, I=None): + # This is the cudaStream_t that we wish to use + cuda_stream_s = faiss.cast_integer_to_cudastream_t(pytorch_stream.cuda_stream) + + # So we can revert GpuResources stream state upon exit + prior_dev = torch.cuda.current_device() + prior_stream = res.getDefaultStream(torch.cuda.current_device()) + + res.setDefaultStream(torch.cuda.current_device(), cuda_stream_s) + + # Do the user work + try: + yield + finally: + res.setDefaultStream(prior_dev, prior_stream) + +def search_index_pytorch(res, index, x, k, D=None, I=None): """call the search function of an index with pytorch tensor I/O (CPU and GPU supported)""" assert x.is_contiguous() @@ -33,13 +58,13 @@ def search_index_pytorch(index, x, k, D=None, I=None): I = torch.empty((n, k), dtype=torch.int64, device=x.device) else: assert I.size() == (n, k) - torch.cuda.synchronize() - xptr = swig_ptr_from_FloatTensor(x) - Iptr = swig_ptr_from_LongTensor(I) - Dptr = swig_ptr_from_FloatTensor(D) - index.search_c(n, xptr, - k, Dptr, Iptr) - torch.cuda.synchronize() + + with using_stream(res): + xptr = swig_ptr_from_FloatTensor(x) + Iptr = swig_ptr_from_LongTensor(I) + Dptr = swig_ptr_from_FloatTensor(D) + index.search_c(n, xptr, k, Dptr, Iptr) + return D, I @@ -97,6 +122,8 @@ def search_raw_array_pytorch(res, xb, xq, k, D=None, I=None, args.numQueries = nq args.outDistances = D_ptr args.outIndices = I_ptr - faiss.bfKnn(res, args) + + with using_stream(res): + faiss.bfKnn(res, args) return D, I diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 9570d3747..40ba34926 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -42,25 +42,29 @@ GpuIndex::GpuIndex(std::shared_ptr resources, GpuIndexConfig config) : Index(dims, metric), resources_(resources), - device_(config.device), - memorySpace_(config.memorySpace), + config_(config), minPagedSize_(kMinPageSize) { - FAISS_THROW_IF_NOT_FMT(device_ < getNumDevices(), - "Invalid GPU device %d", device_); + FAISS_THROW_IF_NOT_FMT(config_.device < getNumDevices(), + "Invalid GPU device %d", config_.device); FAISS_THROW_IF_NOT_MSG(dims > 0, "Invalid number of dimensions"); FAISS_THROW_IF_NOT_FMT( - memorySpace_ == MemorySpace::Device || - (memorySpace_ == MemorySpace::Unified && - getFullUnifiedMemSupport(device_)), + config_.memorySpace == MemorySpace::Device || + (config_.memorySpace == MemorySpace::Unified && + getFullUnifiedMemSupport(config_.device)), "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)", config.device); metric_arg = metricArg; FAISS_ASSERT((bool) resources_); - resources_->initializeForDevice(device_); + 
resources_->initializeForDevice(config_.device); +} + +int +GpuIndex::getDevice() const { + return config_.device; } void @@ -124,7 +128,7 @@ GpuIndex::add_with_ids(Index::idx_t n, } } - DeviceScope scope(device_); + DeviceScope scope(config_.device); addPaged_((int) n, x, ids ? ids : generatedIds.data()); } @@ -171,7 +175,7 @@ GpuIndex::addPage_(int n, auto vecs = toDeviceTemporary(resources_.get(), - device_, + config_.device, const_cast(x), stream, {n, this->d}); @@ -179,7 +183,7 @@ GpuIndex::addPage_(int n, if (ids) { auto indices = toDeviceTemporary(resources_.get(), - device_, + config_.device, const_cast(ids), stream, {n}); @@ -214,8 +218,8 @@ GpuIndex::search(Index::idx_t n, return; } - DeviceScope scope(device_); - auto stream = resources_->getDefaultStream(device_); + DeviceScope scope(config_.device); + auto stream = resources_->getDefaultStream(config_.device); // We guarantee that the searchImpl_ will be called with device-resident // pointers. @@ -228,12 +232,12 @@ GpuIndex::search(Index::idx_t n, // another level of tiling. auto outDistances = toDeviceTemporary( - resources_.get(), device_, distances, stream, + resources_.get(), config_.device, distances, stream, {(int) n, (int) k}); auto outLabels = toDeviceTemporary( - resources_.get(), device_, labels, stream, + resources_.get(), config_.device, labels, stream, {(int) n, (int) k}); bool usePaged = false; @@ -272,12 +276,12 @@ GpuIndex::searchNonPaged_(int n, int k, float* outDistancesData, Index::idx_t* outIndicesData) const { - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); // Make sure arguments are on the device we desire; use temporary // memory allocations to move it if necessary auto vecs = toDeviceTemporary(resources_.get(), - device_, + config_.device, const_cast(x), stream, {n, (int) this->d}); @@ -335,8 +339,8 @@ GpuIndex::searchFromCpuPaged_(int n, // 1 2 3 1 ... 
(pinned buf A) // time -> // - auto defaultStream = resources_->getDefaultStream(device_); - auto copyStream = resources_->getAsyncCopyStream(device_); + auto defaultStream = resources_->getDefaultStream(config_.device); + auto copyStream = resources_->getAsyncCopyStream(config_.device); FAISS_ASSERT((size_t) pageSizeInVecs * this->d <= (size_t) std::numeric_limits::max()); diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 567d4423f..4edaa23c8 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -36,9 +36,8 @@ class GpuIndex : public faiss::Index { float metricArg, GpuIndexConfig config); - inline int getDevice() const { - return device_; - } + /// Returns the device that this index is resident on + int getDevice() const; /// Set the minimum data size for searches (in MiB) for which we use /// CPU -> GPU paging @@ -136,11 +135,8 @@ private: /// Manages streams, cuBLAS handles and scratch memory for devices std::shared_ptr resources_; - /// The GPU device we are resident on - const int device_; - - /// The memory space of our primary storage on the GPU - const MemorySpace memorySpace_; + /// Our configuration options + const GpuIndexConfig config_; /// Size above which we page copies from the CPU to GPU size_t minPagedSize_; diff --git a/faiss/gpu/GpuIndexBinaryFlat.cu b/faiss/gpu/GpuIndexBinaryFlat.cu index 18a95fce6..556dff36b 100644 --- a/faiss/gpu/GpuIndexBinaryFlat.cu +++ b/faiss/gpu/GpuIndexBinaryFlat.cu @@ -23,7 +23,7 @@ GpuIndexBinaryFlat::GpuIndexBinaryFlat(GpuResourcesProvider* provider, GpuIndexBinaryFlatConfig config) : IndexBinary(index->d), resources_(provider->getResources()), - config_(std::move(config)) { + binaryFlatConfig_(config) { FAISS_THROW_IF_NOT_FMT(this->d % 8 == 0, "vector dimension (number of bits) " "must be divisible by 8 (passed %d)", @@ -41,7 +41,7 @@ GpuIndexBinaryFlat::GpuIndexBinaryFlat(GpuResourcesProvider* provider, GpuIndexBinaryFlatConfig config) : IndexBinary(dims), resources_(provider->getResources()), - config_(std::move(config)) { + binaryFlatConfig_(std::move(config)) { FAISS_THROW_IF_NOT_FMT(this->d % 8 == 0, "vector dimension (number of bits) " "must be divisible by 8 (passed %d)", @@ -51,17 +51,22 @@ GpuIndexBinaryFlat::GpuIndexBinaryFlat(GpuResourcesProvider* provider, this->is_trained = true; // Construct index - DeviceScope scope(config_.device); - data_.reset( - new BinaryFlatIndex(resources_.get(), this->d, config_.memorySpace)); + DeviceScope scope(binaryFlatConfig_.device); + data_.reset(new BinaryFlatIndex(resources_.get(), + this->d, binaryFlatConfig_.memorySpace)); } GpuIndexBinaryFlat::~GpuIndexBinaryFlat() { } +int +GpuIndexBinaryFlat::getDevice() const { + return binaryFlatConfig_.device; +} + void GpuIndexBinaryFlat::copyFrom(const faiss::IndexBinaryFlat* index) { - DeviceScope scope(config_.device); + DeviceScope scope(binaryFlatConfig_.device); this->d = index->d; @@ -76,20 +81,20 @@ GpuIndexBinaryFlat::copyFrom(const faiss::IndexBinaryFlat* index) { // destroy old first before allocating new data_.reset(); - data_.reset( - new BinaryFlatIndex(resources_.get(), this->d, config_.memorySpace)); + data_.reset(new BinaryFlatIndex(resources_.get(), + this->d, binaryFlatConfig_.memorySpace)); // The index could be empty if (index->ntotal > 0) { data_->add(index->xb.data(), index->ntotal, - resources_->getDefaultStream(config_.device)); + resources_->getDefaultStream(binaryFlatConfig_.device)); } } void GpuIndexBinaryFlat::copyTo(faiss::IndexBinaryFlat* index) const { - DeviceScope scope(config_.device); + 
DeviceScope scope(binaryFlatConfig_.device); index->d = this->d; index->ntotal = this->ntotal; @@ -101,18 +106,18 @@ GpuIndexBinaryFlat::copyTo(faiss::IndexBinaryFlat* index) const { if (this->ntotal > 0) { fromDevice(data_->getVectorsRef(), index->xb.data(), - resources_->getDefaultStream(config_.device)); + resources_->getDefaultStream(binaryFlatConfig_.device)); } } void GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, const uint8_t* x) { - DeviceScope scope(config_.device); + DeviceScope scope(binaryFlatConfig_.device); // To avoid multiple re-allocations, ensure we have enough storage // available - data_->reserve(n, resources_->getDefaultStream(config_.device)); + data_->reserve(n, resources_->getDefaultStream(binaryFlatConfig_.device)); // Due to GPU indexing in int32, we can't store more than this // number of vectors on a GPU @@ -123,13 +128,13 @@ GpuIndexBinaryFlat::add(faiss::IndexBinary::idx_t n, data_->add((const unsigned char*) x, n, - resources_->getDefaultStream(config_.device)); + resources_->getDefaultStream(binaryFlatConfig_.device)); this->ntotal += n; } void GpuIndexBinaryFlat::reset() { - DeviceScope scope(config_.device); + DeviceScope scope(binaryFlatConfig_.device); // Free the underlying memory data_->reset(); @@ -155,8 +160,8 @@ GpuIndexBinaryFlat::search(faiss::IndexBinary::idx_t n, getMaxKSelection(), (int) k); // select limitation - DeviceScope scope(config_.device); - auto stream = resources_->getDefaultStream(config_.device); + DeviceScope scope(binaryFlatConfig_.device); + auto stream = resources_->getDefaultStream(binaryFlatConfig_.device); // The input vectors may be too large for the GPU, but we still // assume that the output distances and labels are not. @@ -165,7 +170,7 @@ GpuIndexBinaryFlat::search(faiss::IndexBinary::idx_t n, // If we reach a point where all inputs are too big, we can add // another level of tiling. 
auto outDistances = toDeviceTemporary(resources_.get(), - config_.device, + binaryFlatConfig_.device, distances, stream, {(int) n, (int) k}); @@ -203,7 +208,7 @@ GpuIndexBinaryFlat::search(faiss::IndexBinary::idx_t n, // Convert and copy int indices out auto outIndices = toDeviceTemporary(resources_.get(), - config_.device, + binaryFlatConfig_.device, labels, stream, {(int) n, (int) k}); @@ -227,12 +232,12 @@ GpuIndexBinaryFlat::searchNonPaged_(int n, Tensor outDistances(outDistancesData, {n, k}); Tensor outIndices(outIndicesData, {n, k}); - auto stream = resources_->getDefaultStream(config_.device); + auto stream = resources_->getDefaultStream(binaryFlatConfig_.device); // Make sure arguments are on the device we desire; use temporary // memory allocations to move it if necessary auto vecs = toDeviceTemporary(resources_.get(), - config_.device, + binaryFlatConfig_.device, const_cast(x), stream, {n, (int) (this->d / 8)}); @@ -272,10 +277,10 @@ GpuIndexBinaryFlat::searchFromCpuPaged_(int n, void GpuIndexBinaryFlat::reconstruct(faiss::IndexBinary::idx_t key, uint8_t* out) const { - DeviceScope scope(config_.device); + DeviceScope scope(binaryFlatConfig_.device); FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds"); - auto stream = resources_->getDefaultStream(config_.device); + auto stream = resources_->getDefaultStream(binaryFlatConfig_.device); auto& vecs = data_->getVectorsRef(); auto vec = vecs[key]; diff --git a/faiss/gpu/GpuIndexBinaryFlat.h b/faiss/gpu/GpuIndexBinaryFlat.h index dbb475a76..bafff3569 100644 --- a/faiss/gpu/GpuIndexBinaryFlat.h +++ b/faiss/gpu/GpuIndexBinaryFlat.h @@ -38,6 +38,9 @@ class GpuIndexBinaryFlat : public IndexBinary { ~GpuIndexBinaryFlat() override; + /// Returns the device that this index is resident on + int getDevice() const; + /// Initialize ourselves from the given CPU index; will overwrite /// all data in ourselves void copyFrom(const faiss::IndexBinaryFlat* index); @@ -80,7 +83,7 @@ class GpuIndexBinaryFlat : public IndexBinary { std::shared_ptr resources_; /// Configuration options - GpuIndexBinaryFlatConfig config_; + const GpuIndexBinaryFlatConfig binaryFlatConfig_; /// Holds our GPU data containing the list of vectors std::unique_ptr data_; diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index 34f39d3b3..760ab6a21 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -27,7 +27,7 @@ GpuIndexFlat::GpuIndexFlat(GpuResourcesProvider* provider, index->metric_type, index->metric_arg, config), - config_(std::move(config)) { + flatConfig_(config) { // Flat index doesn't need training this->is_trained = true; @@ -42,7 +42,7 @@ GpuIndexFlat::GpuIndexFlat(std::shared_ptr resources, index->metric_type, index->metric_arg, config), - config_(std::move(config)) { + flatConfig_(config) { // Flat index doesn't need training this->is_trained = true; @@ -58,17 +58,17 @@ GpuIndexFlat::GpuIndexFlat(GpuResourcesProvider* provider, metric, 0, config), - config_(std::move(config)) { + flatConfig_(config) { // Flat index doesn't need training this->is_trained = true; // Construct index - DeviceScope scope(device_); + DeviceScope scope(config_.device); data_.reset(new FlatIndex(resources_.get(), dims, - config_.useFloat16, - config_.storeTransposed, - memorySpace_)); + flatConfig_.useFloat16, + flatConfig_.storeTransposed, + config_.memorySpace)); } GpuIndexFlat::GpuIndexFlat(std::shared_ptr resources, @@ -76,17 +76,17 @@ GpuIndexFlat::GpuIndexFlat(std::shared_ptr resources, faiss::MetricType metric, GpuIndexFlatConfig 
config) : GpuIndex(resources, dims, metric, 0, config), - config_(std::move(config)) { + flatConfig_(config) { // Flat index doesn't need training this->is_trained = true; // Construct index - DeviceScope scope(device_); + DeviceScope scope(config_.device); data_.reset(new FlatIndex(resources_.get(), dims, - config_.useFloat16, - config_.storeTransposed, - memorySpace_)); + flatConfig_.useFloat16, + flatConfig_.storeTransposed, + config_.memorySpace)); } GpuIndexFlat::~GpuIndexFlat() { @@ -94,7 +94,7 @@ GpuIndexFlat::~GpuIndexFlat() { void GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); GpuIndex::copyFrom(index); @@ -109,21 +109,21 @@ GpuIndexFlat::copyFrom(const faiss::IndexFlat* index) { data_.reset(); data_.reset(new FlatIndex(resources_.get(), this->d, - config_.useFloat16, - config_.storeTransposed, - memorySpace_)); + flatConfig_.useFloat16, + flatConfig_.storeTransposed, + config_.memorySpace)); // The index could be empty if (index->ntotal > 0) { data_->add(index->xb.data(), index->ntotal, - resources_->getDefaultStream(device_)); + resources_->getDefaultStream(config_.device)); } } void GpuIndexFlat::copyTo(faiss::IndexFlat* index) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); GpuIndex::copyTo(index); @@ -131,10 +131,10 @@ GpuIndexFlat::copyTo(faiss::IndexFlat* index) const { FAISS_ASSERT(data_->getSize() == this->ntotal); index->xb.resize(this->ntotal * this->d); - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); if (this->ntotal > 0) { - if (config_.useFloat16) { + if (flatConfig_.useFloat16) { auto vecFloat32 = data_->getVectorsFloat32Copy(stream); fromDevice(vecFloat32, index->xb.data(), stream); } else { @@ -150,7 +150,7 @@ GpuIndexFlat::getNumVecs() const { void GpuIndexFlat::reset() { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // Free the underlying memory data_->reset(); @@ -176,15 +176,15 @@ GpuIndexFlat::add(Index::idx_t n, const float* x) { return; } - DeviceScope scope(device_); + DeviceScope scope(config_.device); // To avoid multiple re-allocations, ensure we have enough storage // available - data_->reserve(n, resources_->getDefaultStream(device_)); + data_->reserve(n, resources_->getDefaultStream(config_.device)); // If we're not operating in float16 mode, we don't need the input // data to be resident on our device; we can add directly. 
- if (!config_.useFloat16) { + if (!flatConfig_.useFloat16) { addImpl_(n, x, nullptr); } else { // Otherwise, perform the paging @@ -214,7 +214,7 @@ GpuIndexFlat::addImpl_(int n, "GPU index only supports up to %zu indices", (size_t) std::numeric_limits::max()); - data_->add(x, n, resources_->getDefaultStream(device_)); + data_->add(x, n, resources_->getDefaultStream(config_.device)); this->ntotal += n; } @@ -224,7 +224,7 @@ GpuIndexFlat::searchImpl_(int n, int k, float* distances, Index::idx_t* labels) const { - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); // Input and output data are already resident on the GPU Tensor queries(const_cast(x), {n, (int) this->d}); @@ -249,12 +249,12 @@ GpuIndexFlat::searchImpl_(int n, void GpuIndexFlat::reconstruct(faiss::Index::idx_t key, float* out) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(key < this->ntotal, "index out of bounds"); - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); - if (config_.useFloat16) { + if (flatConfig_.useFloat16) { // FIXME jhj: kernel for copy auto vec = data_->getVectorsFloat32Copy(key, 1, stream); fromDevice(vec.data(), out, this->d, stream); @@ -268,13 +268,13 @@ void GpuIndexFlat::reconstruct_n(faiss::Index::idx_t i0, faiss::Index::idx_t num, float* out) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); FAISS_THROW_IF_NOT_MSG(i0 < this->ntotal, "index out of bounds"); FAISS_THROW_IF_NOT_MSG(i0 + num - 1 < this->ntotal, "num out of bounds"); - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); - if (config_.useFloat16) { + if (flatConfig_.useFloat16) { // FIXME jhj: kernel for copy auto vec = data_->getVectorsFloat32Copy(i0, num, stream); fromDevice(vec.data(), out, num * this->d, stream); @@ -301,23 +301,24 @@ GpuIndexFlat::compute_residual_n(faiss::Index::idx_t n, "GPU index only supports up to %zu indices", (size_t) std::numeric_limits::max()); - auto stream = resources_->getDefaultStream(device_); + auto stream = resources_->getDefaultStream(config_.device); - DeviceScope scope(device_); + DeviceScope scope(config_.device); auto vecsDevice = - toDeviceTemporary(resources_.get(), device_, + toDeviceTemporary(resources_.get(), config_.device, const_cast(xs), stream, {(int) n, (int) this->d}); auto idsDevice = toDeviceTemporary( - resources_.get(), device_, + resources_.get(), config_.device, const_cast(keys), stream, {(int) n}); auto residualDevice = - toDeviceTemporary(resources_.get(), device_, residuals, stream, + toDeviceTemporary(resources_.get(), + config_.device, residuals, stream, {(int) n, (int) this->d}); // Convert idx_t to int diff --git a/faiss/gpu/GpuIndexFlat.h b/faiss/gpu/GpuIndexFlat.h index 880f9c6d8..b06f42a7f 100644 --- a/faiss/gpu/GpuIndexFlat.h +++ b/faiss/gpu/GpuIndexFlat.h @@ -129,8 +129,8 @@ class GpuIndexFlat : public GpuIndex { faiss::Index::idx_t* labels) const override; protected: - /// Our config object - const GpuIndexFlatConfig config_; + /// Our configuration options + const GpuIndexFlatConfig flatConfig_; /// Holds our GPU data containing the list of vectors std::unique_ptr data_; diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 47900e5e9..7d15fed02 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -22,12 +22,11 @@ GpuIndexIVF::GpuIndexIVF(GpuResourcesProvider* provider, 
float metricArg, int nlistIn, GpuIndexIVFConfig config) : - GpuIndex(provider->getResources(), - dims, metric, metricArg, config), + GpuIndex(provider->getResources(), dims, metric, metricArg, config), nlist(nlistIn), nprobe(1), quantizer(nullptr), - ivfConfig_(std::move(config)) { + ivfConfig_(config) { init_(); // Only IP and L2 are supported for now @@ -55,7 +54,7 @@ GpuIndexIVF::init_() { // Construct an empty quantizer GpuIndexFlatConfig config = ivfConfig_.flatConfig; // FIXME: inherit our same device - config.device = device_; + config.device = config_.device; if (metric_type == faiss::METRIC_L2) { quantizer = new GpuIndexFlatL2(resources_, d, config); @@ -79,7 +78,7 @@ GpuIndexIVF::getQuantizer() { void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); GpuIndex::copyFrom(index); @@ -105,7 +104,7 @@ GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { // Construct an empty quantizer GpuIndexFlatConfig config = ivfConfig_.flatConfig; // FIXME: inherit our same device - config.device = device_; + config.device = config_.device; if (index->metric_type == faiss::METRIC_L2) { // FIXME: 2 different float16 options? @@ -143,7 +142,7 @@ GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { void GpuIndexIVF::copyTo(faiss::IndexIVF* index) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // // Index information @@ -228,7 +227,7 @@ GpuIndexIVF::trainQuantizer_(faiss::Index::idx_t n, const float* x) { printf ("Training IVF quantizer on %ld vectors in %dD\n", n, d); } - DeviceScope scope(device_); + DeviceScope scope(config_.device); // leverage the CPU-side k-means code, which works for the GPU // flat index as well diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index 8c4ad941d..fb731d954 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -83,7 +83,8 @@ class GpuIndexIVF : public GpuIndex { GpuIndexFlat* quantizer; protected: - GpuIndexIVFConfig ivfConfig_; + /// Our configuration options + const GpuIndexIVFConfig ivfConfig_; }; } } // namespace diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index ba7936219..bac5afb03 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -57,14 +57,14 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { reserveMemoryVecs_ = numVecs; if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reserveMemory(numVecs); } } void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); GpuIndexIVF::copyFrom(index); @@ -88,7 +88,7 @@ GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { false, // no residual nullptr, // no scalar quantizer ivfFlatConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); @@ -96,7 +96,7 @@ GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { void GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // We must have the indices in order to copy to ourselves FAISS_THROW_IF_NOT_MSG(ivfFlatConfig_.indicesOptions != INDICES_IVF, @@ -118,7 +118,7 @@ GpuIndexIVFFlat::copyTo(faiss::IndexIVFFlat* index) const { size_t GpuIndexIVFFlat::reclaimMemory() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); return index_->reclaimMemory(); } @@ -129,7 +129,7 @@ 
GpuIndexIVFFlat::reclaimMemory() { void GpuIndexIVFFlat::reset() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reset(); this->ntotal = 0; @@ -140,7 +140,7 @@ GpuIndexIVFFlat::reset() { void GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); if (this->is_trained) { FAISS_ASSERT(quantizer->is_trained); @@ -161,7 +161,7 @@ GpuIndexIVFFlat::train(Index::idx_t n, const float* x) { false, // no residual nullptr, // no scalar quantizer ivfFlatConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 73a4b604d..17ed50573 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -73,8 +73,9 @@ class GpuIndexIVFFlat : public GpuIndexIVF { float* distances, Index::idx_t* labels) const override; - private: - GpuIndexIVFFlatConfig ivfFlatConfig_; + protected: + /// Our configuration options + const GpuIndexIVFFlatConfig ivfFlatConfig_; /// Desired inverted list memory reservation size_t reserveMemoryVecs_; diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu index 28b8e3be9..47715e2f9 100644 --- a/faiss/gpu/GpuIndexIVFPQ.cu +++ b/faiss/gpu/GpuIndexIVFPQ.cu @@ -30,6 +30,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResourcesProvider* provider, index->nlist, config), ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), subQuantizers_(0), bitsPerCode_(0), reserveMemoryVecs_(0) { @@ -50,6 +51,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(GpuResourcesProvider* provider, nlist, config), ivfpqConfig_(config), + usePrecomputedTables_(config.usePrecomputedTables), subQuantizers_(subQuantizers), bitsPerCode_(bitsPerCode), reserveMemoryVecs_(0) { @@ -64,7 +66,7 @@ GpuIndexIVFPQ::~GpuIndexIVFPQ() { void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); GpuIndexIVF::copyFrom(index); @@ -105,9 +107,9 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { ivfpqConfig_.alternativeLayout, (float*) index->pq.centroids.data(), ivfpqConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); // Doesn't make sense to reserve memory here - index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables); + index_->setPrecomputedCodes(usePrecomputedTables_); // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); @@ -115,7 +117,7 @@ GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { void GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // We must have the indices in order to copy to ourselves FAISS_THROW_IF_NOT_MSG(ivfpqConfig_.indicesOptions != INDICES_IVF, @@ -153,9 +155,9 @@ GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { fromDevice(devPQCentroids, index->pq.centroids.data(), - resources_->getDefaultStream(device_)); + resources_->getDefaultStream(config_.device)); - if (ivfpqConfig_.usePrecomputedTables) { + if (usePrecomputedTables_) { index->precompute_table(); } } @@ -165,16 +167,16 @@ void GpuIndexIVFPQ::reserveMemory(size_t numVecs) { reserveMemoryVecs_ = numVecs; if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reserveMemory(numVecs); } } void GpuIndexIVFPQ::setPrecomputedCodes(bool enable) { - ivfpqConfig_.usePrecomputedTables = enable; + usePrecomputedTables_ = enable; if 
(index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->setPrecomputedCodes(enable); } @@ -183,7 +185,7 @@ GpuIndexIVFPQ::setPrecomputedCodes(bool enable) { bool GpuIndexIVFPQ::getPrecomputedCodes() const { - return ivfpqConfig_.usePrecomputedTables; + return usePrecomputedTables_; } int @@ -204,7 +206,7 @@ GpuIndexIVFPQ::getCentroidsPerSubQuantizer() const { size_t GpuIndexIVFPQ::reclaimMemory() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); return index_->reclaimMemory(); } @@ -214,7 +216,7 @@ GpuIndexIVFPQ::reclaimMemory() { void GpuIndexIVFPQ::reset() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reset(); this->ntotal = 0; @@ -264,17 +266,17 @@ GpuIndexIVFPQ::trainResidualQuantizer_(Index::idx_t n, const float* x) { ivfpqConfig_.alternativeLayout, pq.centroids.data(), ivfpqConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); if (reserveMemoryVecs_) { index_->reserveMemory(reserveMemoryVecs_); } - index_->setPrecomputedCodes(ivfpqConfig_.usePrecomputedTables); + index_->setPrecomputedCodes(usePrecomputedTables_); } void GpuIndexIVFPQ::train(Index::idx_t n, const float* x) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); if (this->is_trained) { FAISS_ASSERT(quantizer->is_trained); @@ -289,7 +291,7 @@ GpuIndexIVFPQ::train(Index::idx_t n, const float* x) { // First, make sure that the data is resident on the CPU, if it is not on the // CPU, as we depend upon parts of the CPU code auto hostData = toHost((float*) x, - resources_->getDefaultStream(device_), + resources_->getDefaultStream(config_.device), {(int) n, (int) this->d}); trainQuantizer_(n, hostData.data()); @@ -351,7 +353,7 @@ GpuIndexIVFPQ::getListLength(int listId) const { std::vector GpuIndexIVFPQ::getListCodes(int listId) const { FAISS_ASSERT(index_); - DeviceScope scope(device_); + DeviceScope scope(config_.device); return index_->getListCodes(listId); } @@ -359,7 +361,7 @@ GpuIndexIVFPQ::getListCodes(int listId) const { std::vector GpuIndexIVFPQ::getListIndices(int listId) const { FAISS_ASSERT(index_); - DeviceScope scope(device_); + DeviceScope scope(config_.device); return index_->getListIndices(listId); } @@ -398,15 +400,15 @@ GpuIndexIVFPQ::verifySettings_() const { // codes per subquantizer size_t requiredSmemSize = lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); - size_t smemPerBlock = getMaxSharedMemPerBlock(device_); + size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); FAISS_THROW_IF_NOT_FMT(requiredSmemSize - <= getMaxSharedMemPerBlock(device_), + <= getMaxSharedMemPerBlock(config_.device), "Device %d has %zu bytes of shared memory, while " "%d bits per code and %d sub-quantizers requires %zu " "bytes. 
Consider useFloat16LookupTables and/or " "reduce parameters", - device_, smemPerBlock, bitsPerCode_, subQuantizers_, + config_.device, smemPerBlock, bitsPerCode_, subQuantizers_, requiredSmemSize); } diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h index 4330eef03..373269598 100644 --- a/faiss/gpu/GpuIndexIVFPQ.h +++ b/faiss/gpu/GpuIndexIVFPQ.h @@ -135,13 +135,16 @@ class GpuIndexIVFPQ : public GpuIndexIVF { float* distances, Index::idx_t* labels) const override; - private: void verifySettings_() const; void trainResidualQuantizer_(Index::idx_t n, const float* x); - private: - GpuIndexIVFPQConfig ivfpqConfig_; + protected: + /// Our configuration options that we were initialized with + const GpuIndexIVFPQConfig ivfpqConfig_; + + /// Runtime override: whether or not we use precomputed tables + bool usePrecomputedTables_; /// Number of sub-quantizers per encoded vector int subQuantizers_; diff --git a/faiss/gpu/GpuIndexIVFScalarQuantizer.cu b/faiss/gpu/GpuIndexIVFScalarQuantizer.cu index 75d590c47..b35780ee5 100644 --- a/faiss/gpu/GpuIndexIVFScalarQuantizer.cu +++ b/faiss/gpu/GpuIndexIVFScalarQuantizer.cu @@ -66,7 +66,7 @@ void GpuIndexIVFScalarQuantizer::reserveMemory(size_t numVecs) { reserveMemoryVecs_ = numVecs; if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reserveMemory(numVecs); } } @@ -74,7 +74,7 @@ GpuIndexIVFScalarQuantizer::reserveMemory(size_t numVecs) { void GpuIndexIVFScalarQuantizer::copyFrom( const faiss::IndexIVFScalarQuantizer* index) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // Clear out our old data index_.reset(); @@ -101,7 +101,7 @@ GpuIndexIVFScalarQuantizer::copyFrom( by_residual, &sq, ivfSQConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); // Copy all of the IVF data index_->copyInvertedListsFrom(index->invlists); @@ -110,7 +110,7 @@ GpuIndexIVFScalarQuantizer::copyFrom( void GpuIndexIVFScalarQuantizer::copyTo( faiss::IndexIVFScalarQuantizer* index) const { - DeviceScope scope(device_); + DeviceScope scope(config_.device); // We must have the indices in order to copy to ourselves FAISS_THROW_IF_NOT_MSG( @@ -135,7 +135,7 @@ GpuIndexIVFScalarQuantizer::copyTo( size_t GpuIndexIVFScalarQuantizer::reclaimMemory() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); return index_->reclaimMemory(); } @@ -146,7 +146,7 @@ GpuIndexIVFScalarQuantizer::reclaimMemory() { void GpuIndexIVFScalarQuantizer::reset() { if (index_) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); index_->reset(); this->ntotal = 0; @@ -163,7 +163,7 @@ GpuIndexIVFScalarQuantizer::trainResiduals_(Index::idx_t n, const float* x) { void GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { - DeviceScope scope(device_); + DeviceScope scope(config_.device); if (this->is_trained) { FAISS_ASSERT(quantizer->is_trained); @@ -178,7 +178,7 @@ GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { // First, make sure that the data is resident on the CPU, if it is not on the // CPU, as we depend upon parts of the CPU code auto hostData = toHost((float*) x, - resources_->getDefaultStream(device_), + resources_->getDefaultStream(config_.device), {(int) n, (int) this->d}); trainQuantizer_(n, hostData.data()); @@ -192,7 +192,7 @@ GpuIndexIVFScalarQuantizer::train(Index::idx_t n, const float* x) { by_residual, &sq, ivfSQConfig_.indicesOptions, - memorySpace_)); + config_.memorySpace)); if (reserveMemoryVecs_) { 
index_->reserveMemory(reserveMemoryVecs_); diff --git a/faiss/gpu/GpuIndexIVFScalarQuantizer.h b/faiss/gpu/GpuIndexIVFScalarQuantizer.h index 63d5a12ab..67a1cc4e5 100644 --- a/faiss/gpu/GpuIndexIVFScalarQuantizer.h +++ b/faiss/gpu/GpuIndexIVFScalarQuantizer.h @@ -88,8 +88,9 @@ class GpuIndexIVFScalarQuantizer : public GpuIndexIVF { /// Exposed like the CPU version bool by_residual; - private: - GpuIndexIVFScalarQuantizerConfig ivfSQConfig_; +protected: + /// Our configuration options + const GpuIndexIVFScalarQuantizerConfig ivfSQConfig_; /// Desired inverted list memory reservation size_t reserveMemoryVecs_; diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 46340dc86..90d983c60 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -101,12 +101,8 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() { for (auto& entry : defaultStreams_) { DeviceScope scope(entry.first); - auto it = userDefaultStreams_.find(entry.first); - if (it == userDefaultStreams_.end()) { - // The user did not specify this stream, thus we are the ones - // who have created it - CUDA_VERIFY(cudaStreamDestroy(entry.second)); - } + // We created these streams, so are responsible for destroying them + CUDA_VERIFY(cudaStreamDestroy(entry.second)); } for (auto& entry : alternateStreams_) { @@ -210,16 +206,14 @@ StandardGpuResourcesImpl::setPinnedMemory(size_t size) { void StandardGpuResourcesImpl::setDefaultStream(int device, cudaStream_t stream) { - auto it = defaultStreams_.find(device); - if (it != defaultStreams_.end()) { - // Replace this stream with the user stream - CUDA_VERIFY(cudaStreamDestroy(it->second)); - it->second = stream; - } - userDefaultStreams_[device] = stream; } +void +StandardGpuResourcesImpl::revertDefaultStream(int device) { + userDefaultStreams_.erase(device); +} + void StandardGpuResourcesImpl::setDefaultNullStreamAllDevices() { for (int dev = 0; dev < getNumDevices(); ++dev) { @@ -274,14 +268,8 @@ StandardGpuResourcesImpl::initializeForDevice(int device) { // Create streams cudaStream_t defaultStream = 0; - auto it = userDefaultStreams_.find(device); - if (it != userDefaultStreams_.end()) { - // We already have a stream provided by the user - defaultStream = it->second; - } else { - CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, - cudaStreamNonBlocking)); - } + CUDA_VERIFY(cudaStreamCreateWithFlags(&defaultStream, + cudaStreamNonBlocking)); defaultStreams_[device] = defaultStream; @@ -341,6 +329,14 @@ StandardGpuResourcesImpl::getBlasHandle(int device) { cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) { initializeForDevice(device); + + auto it = userDefaultStreams_.find(device); + if (it != userDefaultStreams_.end()) { + // There is a user override stream set + return it->second; + } + + // Otherwise, our base default stream return defaultStreams_[device]; } @@ -539,6 +535,11 @@ StandardGpuResources::setDefaultStream(int device, cudaStream_t stream) { res_->setDefaultStream(device, stream); } +void +StandardGpuResources::revertDefaultStream(int device) { + res_->revertDefaultStream(device); +} + void StandardGpuResources::setDefaultNullStreamAllDevices() { res_->setDefaultNullStreamAllDevices(); diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 34b0b88ff..c6b46183f 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -41,9 +41,23 @@ class StandardGpuResourcesImpl : public GpuResources { /// transfers void 
setPinnedMemory(size_t size); - /// Called to change the stream for work ordering + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. void setDefaultStream(int device, cudaStream_t stream); + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. + void revertDefaultStream(int device); + + /// Returns the stream for the given device on which all Faiss GPU work is + /// ordered. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. + cudaStream_t getDefaultStream(int device) override; + /// Called to change the work ordering streams to the null stream /// for all devices void setDefaultNullStreamAllDevices(); @@ -60,8 +74,6 @@ class StandardGpuResourcesImpl : public GpuResources { cublasHandle_t getBlasHandle(int device) override; - cudaStream_t getDefaultStream(int device) override; - std::vector getAlternateStreams(int device) override; /// Allocate non-temporary GPU memory @@ -128,7 +140,9 @@ class StandardGpuResourcesImpl : public GpuResources { }; /// Default implementation of GpuResources that allocates a cuBLAS -/// stream and 2 streams for use, as well as temporary memory +/// stream and 2 streams for use, as well as temporary memory. +/// Internally, the Faiss GPU code uses the instance managed by getResources, +/// but this is the user-facing object that is internally reference counted. class StandardGpuResources : public GpuResourcesProvider { public: StandardGpuResources(); @@ -151,9 +165,17 @@ class StandardGpuResources : public GpuResourcesProvider { /// transfers void setPinnedMemory(size_t size); - /// Called to change the stream for work ordering + /// Called to change the stream for work ordering. We do not own `stream`; + /// i.e., it will not be destroyed when the GpuResources object gets cleaned + /// up. + /// We are guaranteed that all Faiss GPU work is ordered with respect to + /// this stream upon exit from an index or other Faiss GPU call. void setDefaultStream(int device, cudaStream_t stream); + /// Revert the default stream to the original stream managed by this resources + /// object, in case someone called `setDefaultStream`. 
+ void revertDefaultStream(int device); + /// Called to change the work ordering streams to the null stream /// for all devices void setDefaultNullStreamAllDevices(); diff --git a/faiss/gpu/test/test_pytorch_faiss.py b/faiss/gpu/test/test_pytorch_faiss.py index ccc459870..fdee8dee1 100644 --- a/faiss/gpu/test/test_pytorch_faiss.py +++ b/faiss/gpu/test/test_pytorch_faiss.py @@ -10,7 +10,7 @@ import unittest import faiss import torch -from faiss.contrib.pytorch_tensors import search_index_pytorch, search_raw_array_pytorch +from faiss.contrib.pytorch_tensors import search_index_pytorch, search_raw_array_pytorch, using_stream def to_column_major(x): if hasattr(torch, 'contiguous_format'): @@ -22,40 +22,38 @@ def to_column_major(x): class PytorchFaissInterop(unittest.TestCase): def test_interop(self): - - d = 16 - nq = 5 - nb = 20 + d = 128 + nq = 100 + nb = 1000 + k = 10 xq = faiss.randn(nq * d, 1234).reshape(nq, d) xb = faiss.randn(nb * d, 1235).reshape(nb, d) res = faiss.StandardGpuResources() - index = faiss.GpuIndexFlatIP(res, d) - index.add(xb) - # reference CPU result - Dref, Iref = index.search(xq, 5) + # Let's run on a non-default stream + s = torch.cuda.Stream() - # query is pytorch tensor (CPU) - xq_torch = torch.FloatTensor(xq) + # Torch will run on this stream + with torch.cuda.stream(s): + # query is pytorch tensor (CPU and GPU) + xq_torch_cpu = torch.FloatTensor(xq) + xq_torch_gpu = xq_torch_cpu.cuda() - D2, I2 = search_index_pytorch(index, xq_torch, 5) + index = faiss.GpuIndexFlatIP(res, d) + index.add(xb) - assert np.all(Iref == I2.numpy()) + # Query with GPU tensor (this will be done on the current pytorch stream) + D2, I2 = search_index_pytorch(res, index, xq_torch_gpu, k) + Dref, Iref = index.search(xq, k) - # query is pytorch tensor (GPU) - xq_torch = xq_torch.cuda() - # no need for a sync here + assert np.all(Iref == I2.cpu().numpy()) - D3, I3 = search_index_pytorch(index, xq_torch, 5) + # Query with CPU tensor + D3, I3 = search_index_pytorch(res, index, xq_torch_cpu, k) - # D3 and I3 are on torch tensors on GPU as well. - # this does a sync, which is useful because faiss and - # pytorch use different Cuda streams. 
- res.syncDefaultStreamCurrentDevice() - - assert np.all(Iref == I3.cpu().numpy()) + assert np.all(Iref == I3.numpy()) def test_raw_array_search(self): d = 32 @@ -74,55 +72,51 @@ class PytorchFaissInterop(unittest.TestCase): # resource object, can be re-used over calls res = faiss.StandardGpuResources() - # put on same stream as pytorch to avoid synchronizing streams - res.setDefaultNullStreamAllDevices() - for xq_row_major in True, False: - for xb_row_major in True, False: + # Let's have pytorch use a non-default stream + s = torch.cuda.Stream() + with torch.cuda.stream(s): + for xq_row_major in True, False: + for xb_row_major in True, False: - # move to pytorch & GPU - xq_t = torch.from_numpy(xq).cuda() - xb_t = torch.from_numpy(xb).cuda() + # move to pytorch & GPU + xq_t = torch.from_numpy(xq).cuda() + xb_t = torch.from_numpy(xb).cuda() - if not xq_row_major: - xq_t = to_column_major(xq_t) - assert not xq_t.is_contiguous() - - if not xb_row_major: - xb_t = to_column_major(xb_t) - assert not xb_t.is_contiguous() - - D, I = search_raw_array_pytorch(res, xb_t, xq_t, k) - - # back to CPU for verification - D = D.cpu().numpy() - I = I.cpu().numpy() - - assert np.all(I == gt_I) - assert np.all(np.abs(D - gt_D).max() < 1e-4) - - - - # test on subset - try: - D, I = search_raw_array_pytorch(res, xb_t, xq_t[60:80], k) - except TypeError: if not xq_row_major: - # then it is expected - continue - # otherwise it is an error - raise + xq_t = to_column_major(xq_t) + assert not xq_t.is_contiguous() - # back to CPU for verification - D = D.cpu().numpy() - I = I.cpu().numpy() + if not xb_row_major: + xb_t = to_column_major(xb_t) + assert not xb_t.is_contiguous() - assert np.all(I == gt_I[60:80]) - assert np.all(np.abs(D - gt_D[60:80]).max() < 1e-4) + D, I = search_raw_array_pytorch(res, xb_t, xq_t, k) + # back to CPU for verification + D = D.cpu().numpy() + I = I.cpu().numpy() + assert np.all(I == gt_I) + assert np.all(np.abs(D - gt_D).max() < 1e-4) + # test on subset + try: + # This internally uses the current pytorch stream + D, I = search_raw_array_pytorch(res, xb_t, xq_t[60:80], k) + except TypeError: + if not xq_row_major: + # then it is expected + continue + # otherwise it is an error + raise + # back to CPU for verification + D = D.cpu().numpy() + I = I.cpu().numpy() + + assert np.all(I == gt_I[60:80]) + assert np.all(np.abs(D - gt_D[60:80]).max() < 1e-4) if __name__ == '__main__': diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 48bafbe3e..f3822e082 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -274,7 +274,6 @@ void gpu_sync_all_devices() %} - %template() std::pair; %template() std::map >; %template() std::map > >; @@ -287,6 +286,21 @@ void gpu_sync_all_devices() %include %include +typedef CUstream_st* cudaStream_t; + +%inline %{ + +// interop between pytorch exposed cudaStream_t and faiss +cudaStream_t cast_integer_to_cudastream_t(long x) { + return (cudaStream_t) x; +} + +long cast_cudastream_t_to_integer(cudaStream_t x) { + return (long) x; +} + +%} + #else %{
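
Illustrative usage (not part of the patch): a minimal sketch of how the new stream-scoping API in contrib/pytorch_tensors.py is meant to be used, assuming a CUDA-enabled build of Faiss containing the changes above and a CUDA-capable PyTorch install; dimensions and variable names are made up for the example.

import numpy as np
import torch
import faiss
from faiss.contrib.pytorch_tensors import search_index_pytorch, using_stream

d, nb, nq, k = 64, 10000, 16, 5
res = faiss.StandardGpuResources()
index = faiss.GpuIndexFlatL2(res, d)
index.add(np.random.rand(nb, d).astype('float32'))

# Run the PyTorch side of the computation on a non-default stream.
s = torch.cuda.Stream()
with torch.cuda.stream(s):
    xq = torch.rand(nq, d, device='cuda')

    # search_index_pytorch now takes the GpuResources object as its first
    # argument and scopes the Faiss call onto torch.cuda.current_stream()
    # via using_stream, so no torch.cuda.synchronize() calls are needed.
    D, I = search_index_pytorch(res, index, xq, k)

    # using_stream can also wrap any other Faiss GPU call directly; on exit
    # the previous default stream of the resources object is restored.
    with using_stream(res):
        index.add(np.random.rand(nb, d).astype('float32'))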
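
The same effect can be obtained without the context manager, using the lower-level pieces added in this patch (the SWIG cudaStream_t casts and StandardGpuResources::setDefaultStream / revertDefaultStream). A sketch, continuing the variables from the example above:

s = torch.cuda.Stream()

# Hand Faiss the raw cudaStream_t underlying the pytorch stream. Faiss does
# not take ownership of a user-provided stream; it only orders its work on it.
raw_stream = faiss.cast_integer_to_cudastream_t(s.cuda_stream)
res.setDefaultStream(torch.cuda.current_device(), raw_stream)

# ... Faiss GPU work issued here is ordered on s ...
index.add(np.random.rand(nb, d).astype('float32'))

# Go back to the stream that the resources object created and owns internally.
res.revertDefaultStream(torch.cuda.current_device())

Note the behavioural change in StandardGpuResourcesImpl that makes this safe: the internally created default streams are now always owned (and destroyed) by the resources object, while a user-supplied stream lives in a separate override map that getDefaultStream consults first, so a temporary override like using_stream can always be reverted.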