/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../../FaissAssert.h"
#include "Tensor.cuh"
#include "DeviceUtils.h"
#include "StaticUtils.h" // utils::divUp
#include <cuda.h>

#include <algorithm>
#include <limits>
#include <utility>
#include <stdio.h>

namespace faiss { namespace gpu {
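
// TensorInfo is a small POD view of a Tensor: the data pointer plus sizes
// and strides for up to kMaxDims dimensions. It can be passed to a kernel
// by value, and its sizes/strides can be permuted on the host (see
// runTransposeAny below) without touching the underlying Tensor.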
template <typename T, typename IndexT>
struct TensorInfo {
  static constexpr int kMaxDims = 8;

  T* data;
  IndexT sizes[kMaxDims];
  IndexT strides[kMaxDims];
  int dims;
};
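
// TensorInfoOffset converts a linear element index into a strided memory
// offset by peeling off one coordinate per dimension, innermost first.
//
// Worked example (hypothetical values): for a 2-D TensorInfo with
// sizes = {3, 4} and strides = {4, 1}, linearId = 7 decomposes as
//   i = 1: 7 % 4 = 3 -> offset += 3 * 1; linearId becomes 7 / 4 = 1
//   i = 0: 1 % 3 = 1 -> offset += 1 * 4
// giving offset = 7, i.e. row 1, column 3 of a contiguous 3 x 4 tensor.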
template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset {
  // Returns IndexT (not unsigned int) so 64-bit offsets are not truncated
  // when IndexT is a 64-bit type.
  __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
                                      IndexT linearId) {
    IndexT offset = 0;

#pragma unroll
    for (int i = Dim - 1; i >= 0; --i) {
      IndexT curDimIndex = linearId % info.sizes[i];
      IndexT curDimOffset = curDimIndex * info.strides[i];

      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};

template <typename T, typename IndexT>
struct TensorInfoOffset<T, IndexT, -1> {
  __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
                                      IndexT linearId) {
    return linearId;
  }
};

template <typename T, typename IndexT, int Dim>
TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
  TensorInfo<T, IndexT> info;

  for (int i = 0; i < Dim; ++i) {
    info.sizes[i] = (IndexT) t.getSize(i);
    info.strides[i] = (IndexT) t.getStride(i);
  }

  info.data = t.data();
  info.dims = Dim;

  return info;
}
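
// Linearized transpose kernel. Each thread walks the elements in a
// grid-stride loop; the linear index is mapped to a strided input offset
// via TensorInfoOffset<..., DimInput>. runTransposeAny launches this with
// DimOutput == -1, so the output is addressed directly by the linear index,
// which assumes the output tensor is contiguous.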
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
                             TensorInfo<T, IndexT> output,
                             IndexT totalSize) {
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
       i < totalSize;
       // grid-stride loop: step by the total number of threads in the grid
       i += gridDim.x * blockDim.x) {
    auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
    auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);

#if __CUDA_ARCH__ >= 350
    output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
    output.data[outputOffset] = input.data[inputOffset];
#endif
  }
}

/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is obtained if the transposed dimensions are not
/// innermost, since then the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but that would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for the cases that we care about (outer dimension
/// transpositions).
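///
/// A minimal usage sketch (the shapes and the stream are hypothetical):
///
///   Tensor<float, 3, true> in(...);   // e.g. sizes {batch, d1, d2}
///   Tensor<float, 3, true> out(...);  // must have sizes {d1, batch, d2}
///   runTransposeAny(in, 0, 1, out, stream);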
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
                "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  FAISS_ASSERT(dim1 < Dim && dim2 < Dim);

  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  size_t totalSize = in.numElements();
  size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);

  if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
    // div/mod seems faster with unsigned types
    auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);

    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

    transposeAny<T, unsigned int, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  } else {
    auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);

    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

    transposeAny<T, unsigned long, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  }

  CUDA_TEST_ERROR();
}

} } // namespace