/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the CC-by-NC license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "../../FaissAssert.h"
#include "Tensor.cuh"
#include "DeviceUtils.h"
// NOTE(review): the names of the two system headers were missing from the
// reviewed text; the headers below cover the names this file uses
// (std::min, std::swap, CUDA types) -- confirm against the original list.
#include <cuda.h>
#include <algorithm>
#include <utility>

// NOTE(review): template parameter/argument lists and the kernel launch
// configuration were missing from the reviewed text and have been restored
// from how each name is used below. In particular `Tensor<T, Dim, true>` and
// the `-1` value selecting the linear-offset specialization are assumptions;
// confirm them against Tensor.cuh and existing callers.

namespace faiss { namespace gpu {

/// Plain-data description of a tensor that can be passed by value to a
/// kernel: a data pointer plus per-dimension sizes and strides.
template <typename T>
struct TensorInfo {
  /// Capacity of the `sizes` / `strides` arrays
  static constexpr int kMaxDims = 8;

  T* data;
  int sizes[kMaxDims];
  int strides[kMaxDims];
  /// Number of leading entries of `sizes` / `strides` that are in use
  int dims;
};

/// Converts a linear element id into an element offset for a tensor
/// described by `Dim` (size, stride) pairs.
template <typename T, int Dim>
struct TensorInfoOffset {
  /// Peels one index per dimension off `linearId`, starting at dimension
  /// `Dim - 1` and working towards dimension 0, and accumulates
  /// `index * stride` for each of them.
  __device__ inline static unsigned int get(const TensorInfo<T>& info,
                                            unsigned int linearId) {
    unsigned int offset = 0;

#pragma unroll
    for (int i = Dim - 1; i >= 0; --i) {
      unsigned int curDimIndex = linearId % info.sizes[i];
      unsigned int curDimOffset = curDimIndex * info.strides[i];

      offset += curDimOffset;

      // Nothing is left to decompose after dimension 0, so skip the
      // final division
      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};

/// Specialization in which the linear id is used as the offset unchanged;
/// `info` is not consulted.
/// NOTE(review): presumably intended for contiguous tensors -- confirm.
template <typename T>
struct TensorInfoOffset<T, -1> {
  __device__ inline static unsigned int get(const TensorInfo<T>& info,
                                            unsigned int linearId) {
    return linearId;
  }
};

/// Copies the data pointer and the first `Dim` sizes and strides of `t`
/// into a TensorInfo; entries past `Dim` are left unset.
template <typename T, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
  TensorInfo<T> info;

  for (int i = 0; i < Dim; ++i) {
    info.sizes[i] = t.getSize(i);
    info.strides[i] = t.getStride(i);
  }

  info.data = t.data();
  info.dims = Dim;

  return info;
}

/// Copies one element per thread from `input` to `output`.
/// Expects a 1-d grid of 1-d blocks providing at least `totalSize` threads;
/// threads whose id is >= `totalSize` do nothing. Uses no shared memory.
/// `DimInput` / `DimOutput` select the TensorInfoOffset variant used to
/// turn the thread id into an offset on each side.
template <typename T, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input,
                             TensorInfo<T> output,
                             unsigned int totalSize) {
  auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid is rounded up to whole blocks, so trailing threads may have
  // no element to handle
  if (linearThreadId >= totalSize) {
    return;
  }

  auto inputOffset =
    TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
  auto outputOffset =
    TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);

  // `input` is only read here, so load it via the read-only data cache
  output.data[outputOffset] = __ldg(&input.data[inputOffset]);
}

/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is if the transposed dimensions are not
/// innermost, since the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for cases that we care about (outer dimension
/// transpositions).
///
/// @param in     tensor to read from
/// @param dim1   first dimension to exchange; must be in [0, Dim)
/// @param dim2   second dimension to exchange; must be in [0, Dim) and
///               differ from `dim1`
/// @param out    tensor to write to; its sizes must equal those of `in`
///               with `dim1` and `dim2` exchanged
/// @param stream stream on which the kernel is launched
///
/// Violated preconditions trip FAISS_ASSERT. An input with zero elements
/// is a no-op: no kernel is launched.
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  // Both dimensions index fixed-size arrays below, so reject negative
  // values as well as values that are too large
  FAISS_ASSERT(dim1 >= 0 && dim1 < Dim);
  FAISS_ASSERT(dim2 >= 0 && dim2 < Dim);

  // The output must have the input's shape with the two dimensions exchanged
  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  int totalSize = in.numElements();

  if (totalSize == 0) {
    // Nothing to copy. Without this guard numThreads below would be 0,
    // which divides by zero in divUp and is not a valid block size.
    return;
  }

  auto inInfo = getTensorInfo<T, Dim>(in);
  auto outInfo = getTensorInfo<T, Dim>(out);

  // Exchanging the input's (size, stride) pairs makes it enumerate its
  // elements in the output's element order; the output side then uses the
  // linear id directly as its offset
  std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
  std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

  // One thread per element, in 1-d blocks that are as large as the device
  // allows
  int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize);
  auto grid = dim3(utils::divUp(totalSize, numThreads));
  auto block = dim3(numThreads);

  transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>(
    inInfo, outInfo, totalSize);
  CUDA_VERIFY(cudaGetLastError());
}

} } // namespace