Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
Transpose.cuh
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #pragma once
13 
14 #include "../../FaissAssert.h"
15 #include "Tensor.cuh"
16 #include "DeviceUtils.h"
17 #include <cuda.h>
18 
19 #include <stdio.h>
20 
21 namespace faiss { namespace gpu {
22 
/// Host/device-copyable mirror of a Tensor's metadata. Unlike the
/// statically-dimensioned Tensor type, this carries its dimensionality
/// at runtime and can be passed by value as a kernel argument.
template <typename T>
struct TensorInfo {
  /// Maximum tensor dimensionality supported
  static constexpr int kMaxDims = 8;

  T* data;               ///< pointer to the first element
  int sizes[kMaxDims];   ///< size of each dimension; only the first `dims` entries are valid
  int strides[kMaxDims]; ///< stride of each dimension, in elements (used to index `data` directly)
  int dims;              ///< number of valid dimensions
};
32 
/// Maps a linear element index to an element offset into a
/// (possibly non-contiguous) tensor described by a TensorInfo,
/// using the first `Dim` size/stride pairs.
template <typename T, int Dim>
struct TensorInfoOffset {
  __device__ inline static unsigned int get(const TensorInfo<T>& info,
                                            unsigned int linearId) {
    unsigned int offset = 0;

    // Peel off one dimension's index at a time, innermost (i == Dim - 1)
    // first, and accumulate index * stride for each
#pragma unroll
    for (int i = Dim - 1; i >= 0; --i) {
      unsigned int curDimIndex = linearId % info.sizes[i];
      unsigned int curDimOffset = curDimIndex * info.strides[i];

      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
54 
/// Specialization for Dim == -1: the tensor is addressed as if
/// contiguous, so the linear index is itself the element offset and no
/// size/stride arithmetic is required.
template <typename T>
struct TensorInfoOffset<T, -1> {
  __device__ inline static unsigned int get(const TensorInfo<T>& info,
                                            unsigned int linearId) {
    return linearId;
  }
};
62 
/// Copies a Tensor's data pointer, per-dimension sizes and strides into
/// the runtime-dimensioned TensorInfo mirror that kernels accept by value.
template <typename T, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
  TensorInfo<T> info;

  for (int i = 0; i < Dim; ++i) {
    info.sizes[i] = t.getSize(i);
    info.strides[i] = t.getStride(i);
  }

  info.data = t.data();
  info.dims = Dim;

  return info;
}
77 
/// Linearized element-wise copy kernel: each thread moves exactly one
/// element from `input` to `output`, translating its linear id through
/// each tensor's size/stride metadata.
/// Expects a 1D grid of 1D blocks providing at least `totalSize` threads.
/// A dimension template argument of -1 means the corresponding tensor is
/// addressed as contiguous (the linear id is used as the offset directly).
template <typename T, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input,
                             TensorInfo<T> output,
                             unsigned int totalSize) {
  auto linearThreadId = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard the grid tail: totalSize rarely divides the launch size evenly
  if (linearThreadId >= totalSize) {
    return;
  }

  auto inputOffset =
    TensorInfoOffset<T, DimInput>::get(input, linearThreadId);
  auto outputOffset =
    TensorInfoOffset<T, DimOutput>::get(output, linearThreadId);

  // __ldg routes the read through the read-only data cache
  output.data[outputOffset] = __ldg(&input.data[inputOffset]);
}
95 
/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is if the transposed dimensions are not
/// innermost, since the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for cases that we care about (outer dimension
/// transpositions).
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  FAISS_ASSERT(dim1 < Dim && dim2 < Dim);

  // The output must have the input's sizes with dim1/dim2 exchanged
  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  auto inInfo = getTensorInfo<T, Dim>(in);
  auto outInfo = getTensorInfo<T, Dim>(out);

  // View the input through swapped size/stride pairs: reading the input
  // at the output's linear index then yields the transposed element
  std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
  std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

  int totalSize = in.numElements();

  // One thread per element; cap the block size at the device maximum
  int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize);
  auto grid = dim3(utils::divUp(totalSize, numThreads));
  auto block = dim3(numThreads);

  // DimOutput == -1: the output is written in linear-index order.
  // NOTE(review): this presumes `out` is contiguous — confirm with callers.
  transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  // Launches don't return errors directly; surface bad-config errors here
  CUDA_VERIFY(cudaGetLastError());
}
142 
143 } } // namespace
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:162
__host__ __device__ IndexT getStride(int i) const
Definition: Tensor.cuh:216
Our tensor type.
Definition: Tensor.cuh:31
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:210