13 #include "../../FaissAssert.h"
15 #include "DeviceUtils.h"
20 namespace faiss {
namespace gpu {
22 template <
typename T,
typename IndexT>
24 static constexpr
int kMaxDims = 8;
27 IndexT sizes[kMaxDims];
28 IndexT strides[kMaxDims];
32 template <
typename T,
typename IndexT,
int Dim>
39 for (
int i = Dim - 1; i >= 0; --i) {
40 IndexT curDimIndex = linearId % info.sizes[i];
41 IndexT curDimOffset = curDimIndex * info.strides[i];
43 offset += curDimOffset;
46 linearId /= info.sizes[i];
54 template <
typename T,
typename IndexT>
62 template <
typename T,
typename IndexT,
int Dim>
66 for (
int i = 0; i < Dim; ++i) {
67 info.sizes[i] = (IndexT) t.
getSize(i);
68 info.strides[i] = (IndexT) t.
getStride(i);
77 template <
typename T,
typename IndexT,
int DimInput,
int DimOutput>
78 __global__
void transposeAny(TensorInfo<T, IndexT> input,
79 TensorInfo<T, IndexT> output,
81 for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x;
83 i += gridDim.x + blockDim.x) {
84 auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
85 auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);
87 #if __CUDA_ARCH__ >= 350
88 output.data[outputOffset] = __ldg(&input.data[inputOffset]);
90 output.data[outputOffset] = input.data[inputOffset];
104 template <
typename T,
int Dim>
105 void runTransposeAny(Tensor<T, Dim, true>& in,
107 Tensor<T, Dim, true>& out,
108 cudaStream_t stream) {
109 static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
110 "too many dimensions");
112 FAISS_ASSERT(dim1 != dim2);
113 FAISS_ASSERT(dim1 < Dim && dim2 < Dim);
117 for (
int i = 0; i < Dim; ++i) {
118 outSize[i] = in.getSize(i);
121 std::swap(outSize[dim1], outSize[dim2]);
123 for (
int i = 0; i < Dim; ++i) {
124 FAISS_ASSERT(out.getSize(i) == outSize[i]);
127 size_t totalSize = in.numElements();
128 size_t block = std::min((
size_t) getMaxThreadsCurrentDevice(), totalSize);
130 if (totalSize <= (
size_t) std::numeric_limits<int>::max()) {
132 auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
133 auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);
135 std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
136 std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
138 auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
140 transposeAny<T,
unsigned int, Dim, -1>
141 <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
143 auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
144 auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);
146 std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
147 std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);
149 auto grid = std::min(utils::divUp(totalSize, block), (
size_t) 4096);
151 transposeAny<T,
unsigned long, Dim, -1>
152 <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
__host__ __device__ IndexT getSize(int i) const
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
__host__ __device__ IndexT getStride(int i) const