12 #include "../../FaissAssert.h"
14 #include "DeviceUtils.h"
19 namespace faiss {
namespace gpu {
// Layout descriptor for a tensor, parameterized on the element type T and the
// integer type IndexT used for sizes, strides and offsets.
// NOTE(review): only part of this definition is visible in this listing;
// the comments below describe the visible members only.
21 template <
typename T,
typename IndexT>
// Capacity of the fixed-size per-dimension arrays below; callers
// static_assert their dimension count against this value.
23 static constexpr
int kMaxDims = 8;
// Extent of each dimension, indexed by dimension number.
26 IndexT sizes[kMaxDims];
// Per-dimension multiplier applied to a coordinate when computing the offset
// of an element (used as `index * strides[i]` by the offset computation).
27 IndexT strides[kMaxDims];
// Converts a linear element id into a strided offset for a tensor with Dim
// dimensions, using the sizes and strides stored in `info`.
// NOTE(review): the enclosing signature, the declaration of `offset` and the
// return statement are not visible in this listing.
31 template <
typename T,
typename IndexT,
int Dim>
// Walk the dimensions from the last one (Dim - 1) down to 0, extracting one
// coordinate from linearId per iteration.
38 for (
int i = Dim - 1; i >= 0; --i) {
// Coordinate along dimension i.
39 IndexT curDimIndex = linearId % info.sizes[i];
// Contribution of that coordinate to the final offset.
40 IndexT curDimOffset = curDimIndex * info.strides[i];
42 offset += curDimOffset;
// Remove dimension i from linearId so the next iteration sees the remaining
// (outer) coordinates.
45 linearId /= info.sizes[i];
53 template <
typename T,
typename IndexT>
// Fills a TensorInfo from tensor `t`: for each of the Dim dimensions, the
// tensor's size and stride are read and cast to IndexT.
// NOTE(review): the function signature and anything assigned after this loop
// are not visible in this listing.
61 template <
typename T,
typename IndexT,
int Dim>
// Copy the per-dimension sizes and strides; the casts narrow or widen the
// tensor's own index type to the IndexT requested by the caller.
65 for (
int i = 0; i < Dim; ++i) {
66 info.sizes[i] = (IndexT) t.
getSize(i);
67 info.strides[i] = (IndexT) t.
getStride(i);
/// Element-wise copy kernel used to implement transposition.
///
/// For every linear element id `i` in [0, totalSize), the id is translated to
/// an offset in `input` via TensorInfoOffset<T, IndexT, DimInput> and to an
/// offset in `output` via TensorInfoOffset<T, IndexT, DimOutput>, and the
/// element is copied from the former to the latter.
///
/// Launch layout: 1-D grid of 1-D blocks. The grid does not have to cover
/// totalSize; each thread starts at its global thread index and advances by
/// the total number of launched threads until it runs past totalSize.
/// No shared memory is used.
///
/// @param input     descriptor of the source tensor
/// @param output    descriptor of the destination tensor
/// @param totalSize number of elements to copy
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
                             TensorInfo<T, IndexT> output,
                             IndexT totalSize) {
  // Total number of threads in the launch. This must be the product of the
  // grid and block extents (not their sum) so that the threads partition
  // [0, totalSize) without overlap.
  const IndexT stride = (IndexT) gridDim.x * (IndexT) blockDim.x;

  for (IndexT i = (IndexT) blockIdx.x * (IndexT) blockDim.x + threadIdx.x;
       i < totalSize;
       i += stride) {
    auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
    auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);

#if __CUDA_ARCH__ >= 350
    // __ldg (read-only data cache load) is only available on SM 3.5+
    output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
    output.data[outputOffset] = input.data[inputOffset];
#endif
  }
}
/// Out-of-place transposition of dimensions `dim1` and `dim2` of `in`,
/// written to `out`, launched asynchronously on `stream`.
///
/// `out` must already have the sizes of `in` with dim1 and dim2 exchanged;
/// this is asserted. An empty input is a no-op.
///
/// @param in     source tensor
/// @param dim1   first dimension to exchange, in [0, Dim)
/// @param dim2   second dimension to exchange, in [0, Dim), != dim1
/// @param out    destination tensor
/// @param stream CUDA stream the kernel is enqueued on
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
                "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  // dim1 / dim2 index fixed-size arrays below, so reject negatives as well
  FAISS_ASSERT(dim1 >= 0 && dim2 >= 0);
  FAISS_ASSERT(dim1 < Dim && dim2 < Dim);

  // Expected output shape: the input shape with dim1 and dim2 exchanged
  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  size_t totalSize = in.numElements();

  if (totalSize == 0) {
    // Nothing to copy. Returning here also avoids computing a block size of
    // 0, which would be used as a divisor and as a launch dimension below.
    return;
  }

  size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);

  // At most 4096 blocks are launched; the kernel loops to cover any
  // remaining elements.
  auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

  if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
    // Element count fits in 32 bits: use unsigned int index arithmetic
    auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);

    // Present the input with dim1/dim2 exchanged, so that a linear id
    // decomposes against the input in the output's dimension order
    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    transposeAny<T, unsigned int, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  } else {
    // Larger tensors need a wider index type.
    // NOTE(review): unsigned long is only 32 bits wide on LLP64 platforms;
    // confirm the supported targets.
    auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);

    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    transposeAny<T, unsigned long, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  }
}
__host__ __device__ IndexT getSize(int i) const
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
__host__ __device__ IndexT getStride(int i) const