Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
Transpose.cuh
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #pragma once
12 
13 #include "../../FaissAssert.h"
14 #include "Tensor.cuh"
15 #include "DeviceUtils.h"
16 #include <cuda.h>
17 
18 #include <stdio.h>
19 
20 namespace faiss { namespace gpu {
21 
// Lightweight POD descriptor of a tensor (data pointer, per-dimension
// sizes and strides) that can be passed by value to a kernel.
template <typename T, typename IndexT>
struct TensorInfo {
  // Maximum dimensionality this descriptor can represent
  static constexpr int kMaxDims = 8;

  // Raw device pointer to the first element
  T* data;
  // Size in elements of each dimension; only the first `dims` entries are valid
  IndexT sizes[kMaxDims];
  // Stride in elements of each dimension; only the first `dims` entries are valid
  IndexT strides[kMaxDims];
  // Number of dimensions actually in use
  int dims;
};
31 
// Converts a linear element index into the strided element offset for a
// Dim-dimensional tensor: decomposes `linearId` into per-dimension
// coordinates (innermost dimension first) and accumulates
// coordinate * stride for each dimension.
template <typename T, typename IndexT, int Dim>
struct TensorInfoOffset {
  // Returns IndexT (not unsigned int): with IndexT == unsigned long,
  // returning unsigned int would truncate offsets for very large tensors.
  __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
                                      IndexT linearId) {
    IndexT offset = 0;

#pragma unroll
    for (int i = Dim - 1; i >= 0; --i) {
      // Coordinate along dimension i
      IndexT curDimIndex = linearId % info.sizes[i];
      IndexT curDimOffset = curDimIndex * info.strides[i];

      offset += curDimOffset;

      // Peel off this dimension; the last (outermost) one needs no divide
      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
53 
// Specialization for Dim == -1: the tensor is treated as a flat,
// densely-packed array, so the linear index is used directly as the
// element offset with no size/stride arithmetic.
template <typename T, typename IndexT>
struct TensorInfoOffset<T, IndexT, -1> {
  // Returns IndexT (not unsigned int) so offsets are not truncated when
  // IndexT is a 64-bit type on the large-tensor path.
  __device__ inline static IndexT get(const TensorInfo<T, IndexT>& info,
                                      IndexT linearId) {
    return linearId;
  }
};
61 
// Builds a TensorInfo descriptor from a device Tensor, copying out its
// data pointer and narrowing each size/stride to IndexT.
// NOTE(review): the casts assume every size/stride fits in IndexT; the
// caller (runTransposeAny) selects IndexT based on the element count.
template <typename T, typename IndexT, int Dim>
TensorInfo<T, IndexT> getTensorInfo(const Tensor<T, Dim, true>& t) {
  // Guard the fixed-capacity sizes/strides arrays at compile time
  static_assert(Dim <= TensorInfo<T, IndexT>::kMaxDims, "too many dimensions");

  TensorInfo<T, IndexT> info;

  for (int i = 0; i < Dim; ++i) {
    info.sizes[i] = (IndexT) t.getSize(i);
    info.strides[i] = (IndexT) t.getStride(i);
  }

  info.data = t.data();
  info.dims = Dim;

  return info;
}
76 
// Element-wise copy kernel that maps each linear index through the input's
// (swapped) sizes/strides and writes to the corresponding output position.
// Launched as a 1D grid; any grid size is correct thanks to the
// grid-stride loop.
template <typename T, typename IndexT, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T, IndexT> input,
                             TensorInfo<T, IndexT> output,
                             IndexT totalSize) {
  // Total number of threads in the grid; the original `gridDim.x +
  // blockDim.x` under-strided, making each element redundantly copied by
  // many threads (correct only because the copy is idempotent, but
  // wasting a large amount of work).
  IndexT stride = (IndexT) gridDim.x * blockDim.x;

  for (IndexT i = (IndexT) blockIdx.x * blockDim.x + threadIdx.x;
       i < totalSize;
       i += stride) {
    auto inputOffset = TensorInfoOffset<T, IndexT, DimInput>::get(input, i);
    auto outputOffset = TensorInfoOffset<T, IndexT, DimOutput>::get(output, i);

#if __CUDA_ARCH__ >= 350
    // Route the read through the read-only data cache (SM35+)
    output.data[outputOffset] = __ldg(&input.data[inputOffset]);
#else
    output.data[outputOffset] = input.data[inputOffset];
#endif
  }
}
94 
/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is if the transposed dimensions are not
/// innermost, since the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for cases that we care about (outer dimension
/// transpositions).
///
/// Preconditions: dim1 != dim2, both < Dim, and `out`'s sizes equal
/// `in`'s sizes with dim1/dim2 swapped (asserted below). The kernel is
/// enqueued asynchronously on `stream`.
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T, unsigned int>::kMaxDims,
                "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  FAISS_ASSERT(dim1 < Dim && dim2 < Dim);

  // The output must be shaped as the input with dim1/dim2 exchanged
  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  size_t totalSize = in.numElements();

  if (totalSize == 0) {
    // Nothing to copy; also avoids a zero-sized block (invalid launch
    // configuration) and a division by zero in divUp below
    return;
  }

  size_t block = std::min((size_t) getMaxThreadsCurrentDevice(), totalSize);

  if (totalSize <= (size_t) std::numeric_limits<int>::max()) {
    // div/mod seems faster with unsigned types
    auto inInfo = getTensorInfo<T, unsigned int, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned int, Dim>(out);

    // Present the input to the kernel as if already transposed: swapping
    // sizes and strides re-labels the same memory, so a linear walk of the
    // output reads the transposed input
    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

    transposeAny<T, unsigned int, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  } else {
    // Element count exceeds 32-bit range: use wider index arithmetic
    auto inInfo = getTensorInfo<T, unsigned long, Dim>(in);
    auto outInfo = getTensorInfo<T, unsigned long, Dim>(out);

    std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
    std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

    auto grid = std::min(utils::divUp(totalSize, block), (size_t) 4096);

    transposeAny<T, unsigned long, Dim, -1>
      <<<grid, block, 0, stream>>>(inInfo, outInfo, totalSize);
  }
  CUDA_TEST_ERROR();
}
156 
157 } } // namespace
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:224
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:176
Our tensor type.
Definition: Tensor.cuh:30
__host__ __device__ IndexT getStride(int i) const
Definition: Tensor.cuh:230