/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/impl/FaissAssert.h>
#include <faiss/gpu/utils/DeviceUtils.h>
#include <faiss/gpu/utils/StaticUtils.h>
#include <faiss/gpu/utils/Tensor.cuh>
#include <algorithm>
#include <limits>
#include <utility>

namespace faiss {
namespace gpu {

/// A plain-old-data mirror of a Tensor's geometry (data pointer, sizes,
/// strides) that can be passed by value to a kernel.
template <typename T, typename IndexT>
struct TensorInfo {
    static constexpr int kMaxDims = 8;

    T* data;
    IndexT sizes[kMaxDims];
    IndexT strides[kMaxDims];
    int dims;
};

/// Converts a linear element index into a data offset, peeling off the
/// innermost dimension first (div/mod by each size, scaled by that
/// dimension's stride).
template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset {
    __device__ inline static IndexT get(
            const TensorInfo<T, IndexT>& info,
            IndexT linearId) {
        IndexT offset = 0;

#pragma unroll
        for (int i = Dim - 1; i >= 0; --i) {
            IndexT curDimIndex = linearId % info.sizes[i];
            IndexT curDimOffset = curDimIndex * info.strides[i];

            offset += curDimOffset;

            if (i > 0) {
                linearId /= info.sizes[i];
            }
        }

        return offset;
    }
};

/// Specialization for a contiguous tensor (Dim == -1): the linear index
/// is the offset itself, so no div/mod work is needed.
template <typename T, typename IndexT>
struct TensorInfoOffset<T, IndexT, -1> {
    __device__ inline static IndexT get(
            const TensorInfo<T, IndexT>& info,
            IndexT linearId) {
        return linearId;
    }
};

template <typename T, typename IndexT, int Dim>
TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
    TensorInfo<T, IndexT> info;

    for (int i = 0; i < Dim; ++i) {
        info.sizes[i] = (IndexT) t.getSize(i);
        info.strides[i] = (IndexT) t.getStride(i);
    }

    info.data = t.data();
    info.dims = Dim;

    return info;
}

template <typename T, typename IndexT, int Dim>
__global__ void transposeAny(
        TensorInfo<T, IndexT> input,
        TensorInfo<T, IndexT> output,
        IndexT totalSize) {
    // Grid-stride loop: each thread handles every (gridDim.x * blockDim.x)-th
    // element. The output is written contiguously (the Dim == -1
    // specialization), while the input offset is computed through the
    // transposed sizes/strides.
    for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < totalSize;
         i += gridDim.x * blockDim.x) {
        auto inputOffset = TensorInfoOffset<T, IndexT, Dim>::get(input, i);
        auto outputOffset = TensorInfoOffset<T, IndexT, -1>::get(output, i);

#if __CUDA_ARCH__ >= 350
        output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
        output.data[outputOffset] = input.data[inputOffset];
#endif
    }
}

/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is if the transposed dimensions are not
/// innermost, since the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for cases that we care about (outer dimension
/// transpositions).
template <typename T, int Dim>
void runTransposeAny(
        Tensor<T, Dim, true>& in,
        int dim1,
        int dim2,
        Tensor<T, Dim, true>& out,
        cudaStream_t stream) {
    static_assert(
            Dim <= TensorInfo<T, unsigned int>::kMaxDims,
            "too many dimensions");

    FAISS_ASSERT(dim1 != dim2);
    FAISS_ASSERT(dim1 < Dim && dim2 < Dim);

    // The output must have the input's sizes with dim1 and dim2 exchanged
    int outSize[Dim];

    for (int i = 0; i < Dim; ++i) {
        outSize[i] = in.getSize(i);
    }

    std::swap(outSize[dim1], outSize[dim2]);

    for (int i = 0; i < Dim; ++i) {
        FAISS_ASSERT(out.getSize(i) == outSize[i]);
    }

    size_t totalSize = in.numElements();
    size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);

    if (totalSize <= (size_t) std::numeric_limits<unsigned int>::max()) {
        // div/mod seems faster with unsigned types
        auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
        auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);

        std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
        std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

        auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

        transposeAny<T, unsigned int, Dim>
                <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
    } else {
        auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
        auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);

        std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
        std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

        auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

        transposeAny<T, unsigned long, Dim>
                <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
    }

    CUDA_TEST_ERROR();
}

} // namespace gpu
} // namespace faiss
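
// ---------------------------------------------------------------------------
// Illustrative sketch (an editor's addition, not part of the Faiss API): the
// same innermost-first div/mod walk that TensorInfoOffset<T, IndexT, Dim>::get
// performs, written as host code for a fixed 3-d case so it can be checked on
// the CPU. The name linearToOffset3d is hypothetical.
//
// For sizes {2, 3, 4} with contiguous strides {12, 4, 1}, linear id 17 peels
// apart as 17 % 4 = 1, then (17 / 4) % 3 = 1, then (4 / 3) % 2 = 1, giving
// 1*1 + 1*4 + 1*12 = 17, i.e. the identity map. Swap two strides (as
// runTransposeAny does on the input's TensorInfo) and the same loop lands on
// the transposed element instead.
inline unsigned int linearToOffset3d(
        const unsigned int sizes[3],
        const unsigned int strides[3],
        unsigned int linearId) {
    unsigned int offset = 0;

    for (int i = 2; i >= 0; --i) {
        // Index within dimension i, scaled by that dimension's stride
        offset += (linearId % sizes[i]) * strides[i];

        if (i > 0) {
            linearId /= sizes[i];
        }
    }

    return offset;
}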
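
// ---------------------------------------------------------------------------
// Usage sketch (assumptions: `in` is a DeviceTensor<float, 3, true> with
// sizes {a, b, c} and `out` one with sizes {b, a, c}, both already allocated
// by the caller; DeviceTensor comes from faiss/gpu/utils/DeviceTensor.cuh).
// Swapping the two outer dimensions is the coalesced-friendly case the doc
// comment on runTransposeAny describes:
//
//     faiss::gpu::runTransposeAny(in, 0, 1, out, stream);
//
// The caller allocates `out` with the post-transpose sizes; the FAISS_ASSERT
// checks in runTransposeAny verify that the shapes agree.
// ---------------------------------------------------------------------------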