11 #include "../../FaissAssert.h"
13 #include "DeviceUtils.h"
14 #include "StaticUtils.h"
17 namespace faiss {
namespace gpu {
// Size/stride description of a tensor, passed by value to the kernels in this
// file (see the TensorInfo<T, IndexT> kernel parameters further down).
// NOTE(review): the declaration line that names this template and any other
// members (the kernels also read a `data` member) are not visible in this
// excerpt -- confirm against the full header.
19 template <
typename T,
typename IndexT>
// Capacity of the per-dimension arrays below; runTransposeAny static_asserts
// that its Dim does not exceed this value.
21 static constexpr
int kMaxDims = 8;
// Extent of each dimension, indexed by dimension number.
24 IndexT sizes[kMaxDims];
// Per-dimension stride; multiplied by the index within that dimension when an
// element offset is computed.
25 IndexT strides[kMaxDims];
// Converts a linear element id into an element offset for a tensor described
// by `info`, using its first Dim sizes/strides.
// NOTE(review): the enclosing declaration, the function signature, the
// initialisation of `offset` and the return statement are not visible in this
// excerpt; callers use it as TensorInfoOffset<T, IndexT, Dim>::get(info, id).
29 template <
typename T,
typename IndexT,
int Dim>
// Walk the dimensions from the last one to the first, peeling one index off
// linearId per iteration.
36 for (
int i = Dim - 1; i >= 0; --i) {
// Index within dimension i is the remainder by that dimension's size.
37 IndexT curDimIndex = linearId % info.sizes[i];
// Scale that index by the dimension's stride to get its offset contribution.
38 IndexT curDimOffset = curDimIndex * info.strides[i];
40 offset += curDimOffset;
// Drop the dimension just handled so the next iteration sees the remaining id.
43 linearId /= info.sizes[i];
// NOTE(review): only the template header of this definition is visible here.
// It takes no Dim parameter, so it is presumably the TensorInfoOffset
// specialization selected by the `-1` dimension argument used at the
// transposeAny launch sites -- confirm against the full header.
51 template <
typename T,
typename IndexT>
// Fills a TensorInfo-style record `info` from tensor `t` by copying the size
// and stride of each of its Dim dimensions, cast to IndexT.
// NOTE(review): the function signature and the declaration of `info` are not
// visible in this excerpt; presumably this is getTensorInfo<T, IndexT, Dim>,
// which runTransposeAny calls -- confirm.
59 template <
typename T,
typename IndexT,
int Dim>
// One size/stride pair per dimension of t.
63 for (
int i = 0; i < Dim; ++i) {
// The casts narrow or widen t's own index type to the requested IndexT.
64 info.sizes[i] = (IndexT) t.
getSize(i);
65 info.strides[i] = (IndexT) t.
getStride(i);
/// Element-wise copy kernel used to implement a general transpose.
///
/// Every linear element id in [0, totalSize) is mapped to an element offset
/// twice: once through `input`'s size/stride description and once through
/// `output`'s. The element found at the input offset is stored at the output
/// offset.
///
/// Template parameters:
///   T         element type
///   IndexT    integer type used for ids and offsets
///   DimInput  dimension argument forwarded to TensorInfoOffset for `input`
///   DimOutput dimension argument forwarded to TensorInfoOffset for `output`
///             (the launch sites in this file pass -1)
///
/// Launch layout: 1-D grid of 1-D blocks. The loop below is a grid-stride
/// loop, so any grid size is valid; threads whose starting id is already
/// >= totalSize do nothing.
///
/// NOTE(review): the third parameter and the loop bound are restored from the
/// launch sites, which pass (inInfo, outInfo, totalSize) -- confirm against
/// the full header.
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
                             TensorInfo<T, IndexT> output,
                             IndexT totalSize) {
  // Number of threads in the whole launch. The step must be the product
  // gridDim.x * blockDim.x: with a sum, the step no longer equals the thread
  // count, so threads revisit elements that other threads already copied.
  const IndexT stride = (IndexT) gridDim.x * blockDim.x;

  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
       i < totalSize;
       i += stride) {
    auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
    auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);

#if __CUDA_ARCH__ >= 350
    // On compute capability 3.5 and newer, read through __ldg (read-only
    // data cache load).
    output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
    output.data[outputOffset] = input.data[inputOffset];
#endif
  }
}
// Host-side launcher: copies `in` into `out` with dimensions dim1 and dim2
// exchanged, by launching transposeAny on `stream`.
//
// `out` must already have the sizes of `in` with dim1 and dim2 swapped; this
// is asserted below.
//
// NOTE(review): the parameter line declaring dim1/dim2, the declaration of
// outSize, the `else` between the two launch variants and the closing braces
// are not visible in this excerpt.
101 template <
typename T,
int Dim>
102 void runTransposeAny(Tensor<T, Dim, true>& in,
104 Tensor<T, Dim, true>& out,
105 cudaStream_t stream) {
// The TensorInfo arrays hold at most kMaxDims entries.
106 static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
107 "too many dimensions");
// The two dimensions must differ and must both be below Dim.
109 FAISS_ASSERT(dim1 != dim2);
110 FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
// Compute the sizes the output is expected to have: the input sizes ...
114 for (
int i = 0; i < Dim; ++i) {
115 outSize[i] = in.getSize(i);
// ... with the two transposed dimensions exchanged.
118 std::swap(outSize[dim1], outSize[dim2]);
// Verify that the caller-provided output matches those sizes.
120 for (
int i = 0; i < Dim; ++i) {
121 FAISS_ASSERT(out.getSize(i) == outSize[i]);
124 size_t totalSize = in.numElements();
// Threads per block: the value returned by getMaxThreadsCurrentDevice(),
// but never more than there are elements.
// NOTE(review): if totalSize is 0, block is 0 and is then used as the divisor
// argument of utils::divUp and as the launch block size -- confirm that empty
// tensors are rejected before this point.
125 size_t block = std::min((
size_t) getMaxThreadsCurrentDevice(), totalSize);
// When the element count fits in an int, use unsigned int indexing.
127 if (totalSize <= (
size_t) std::numeric_limits<int>::max()) {
129 auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
130 auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
// Describe the input with dim1/dim2 exchanged, so that the same linear id
// passed to both offset computations in the kernel pairs each output element
// with its transposed input element.
132 std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
133 std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
// Cap the grid at 4096 blocks; the kernel loops, so a thread may handle more
// than one element.
135 auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
// The output side is instantiated with dimension argument -1.
// NOTE(review): presumably this selects an offset computation that treats
// `out` as contiguous -- confirm against the TensorInfoOffset specialization.
137 transposeAny<T,
unsigned int, Dim, -1>
138 <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
// Same sequence with unsigned long as the index type (presumably the branch
// taken for larger element counts; the `else` line is not visible).
// NOTE(review): the width of unsigned long is platform-dependent (32 bits on
// LLP64 targets) -- confirm it is wide enough on every supported platform.
140 auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
141 auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
143 std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
144 std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
146 auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
148 transposeAny<T,
unsigned long, Dim, -1>
149 <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
__host__ __device__ IndexT getSize(int i) const
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
__host__ __device__ IndexT getStride(int i) const