Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
Transpose.cuh
1 /**
2  * Copyright (c) 2015-present, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under the BSD+Patents license found in the
6  * LICENSE file in the root directory of this source tree.
7  */
8 
9 // Copyright 2004-present Facebook. All Rights Reserved.
10 
11 #pragma once
12 
13 #include "../../FaissAssert.h"
14 #include "Tensor.cuh"
15 #include "DeviceUtils.h"
16 #include <cuda.h>
17 
18 #include <stdio.h>
19 
20 namespace faiss { namespace gpu {
21 
/// A POD (pass-by-value, kernel-safe) description of a tensor's memory
/// layout: raw data pointer plus per-dimension sizes and strides.
/// Filled in by getTensorInfo() below from a Tensor<T, Dim, true>.
template <typename T>
struct TensorInfo {
  // Maximum rank this fixed-size descriptor can represent
  static constexpr int kMaxDims = 8;

  T* data;                // device pointer to the first element
  int sizes[kMaxDims];    // extent of each dimension; only [0, dims) valid
  int strides[kMaxDims];  // stride (in elements) of each dimension
  int dims;               // number of dimensions actually in use
};
31 
/// Converts a linear element id into a physical data offset using the
/// sizes/strides stored in `info`, for a tensor of compile-time rank `Dim`.
/// (The primary-template declaration line was lost in extraction — the
/// `TensorInfoOffset<T, -1>` specialization below requires it — restored here.)
template <typename T, int Dim>
struct TensorInfoOffset {
  __device__ inline static unsigned int get(const TensorInfo<T>& info,
                                            unsigned int linearId) {
    unsigned int offset = 0;

    // Peel coordinates off the linear id innermost-dimension first,
    // scaling each by that dimension's stride
#pragma unroll
    for (int i = Dim - 1; i >= 0; --i) {
      unsigned int curDimIndex = linearId % info.sizes[i];
      unsigned int curDimOffset = curDimIndex * info.strides[i];

      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
53 
/// Rank "-1" fast path: the linear id is taken to be the physical offset
/// directly, skipping all size/stride arithmetic. (Valid only when the
/// underlying tensor is densely packed.)
template <typename T>
struct TensorInfoOffset<T, -1> {
  __device__ inline static unsigned int get(const TensorInfo<T>& /* unused */,
                                            unsigned int linearId) {
    return linearId;
  }
};
61 
/// Copies the data pointer, sizes and strides of `t` into a POD TensorInfo
/// that can be passed by value into a kernel launch.
template <typename T, int Dim>
TensorInfo<T> getTensorInfo(const Tensor<T, Dim, true>& t) {
  // Guard against writing past the fixed-size sizes/strides arrays;
  // previously only runTransposeAny() enforced this, so a direct call
  // with Dim > kMaxDims would silently overflow
  static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");

  TensorInfo<T> info;

  for (int i = 0; i < Dim; ++i) {
    info.sizes[i] = t.getSize(i);
    info.strides[i] = t.getStride(i);
  }

  info.data = t.data();
  info.dims = Dim;

  return info;
}
76 
/// Element-wise copy kernel: each thread maps one linear element id into
/// an input offset (via DimInput's sizes/strides) and an output offset
/// (via DimOutput's), then copies that single element.
/// Expects a 1-D launch; threads whose id is >= totalSize do nothing.
template <typename T, int DimInput, int DimOutput>
__global__ void transposeAny(TensorInfo<T> input,
                             TensorInfo<T> output,
                             unsigned int totalSize) {
  auto tid = blockIdx.x * blockDim.x + threadIdx.x;

  if (tid < totalSize) {
    auto srcOffset = TensorInfoOffset<T, DimInput>::get(input, tid);
    auto dstOffset = TensorInfoOffset<T, DimOutput>::get(output, tid);

#if __CUDA_ARCH__ >= 350
    // Route the load through the read-only data cache where supported
    output.data[dstOffset] = __ldg(&input.data[srcOffset]);
#else
    output.data[dstOffset] = input.data[srcOffset];
#endif
  }
}
98 
/// Performs an out-of-place transposition between any two dimensions.
/// Best performance is if the transposed dimensions are not
/// innermost, since the reads and writes will be coalesced.
/// Could include a shared memory transposition if the dimensions
/// being transposed are innermost, but would require support for
/// arbitrary rectangular matrices.
/// This linearized implementation seems to perform well enough,
/// especially for cases that we care about (outer dimension
/// transpositions).
template <typename T, int Dim>
void runTransposeAny(Tensor<T, Dim, true>& in,
                     int dim1, int dim2,
                     Tensor<T, Dim, true>& out,
                     cudaStream_t stream) {
  static_assert(Dim <= TensorInfo<T>::kMaxDims, "too many dimensions");

  FAISS_ASSERT(dim1 != dim2);
  // Both dimensions must be valid indices; previously only the upper
  // bound was checked, so a negative dim indexed arrays out of bounds
  FAISS_ASSERT(dim1 >= 0 && dim1 < Dim);
  FAISS_ASSERT(dim2 >= 0 && dim2 < Dim);

  // `out` must have `in`'s sizes with dim1/dim2 exchanged
  int outSize[Dim];

  for (int i = 0; i < Dim; ++i) {
    outSize[i] = in.getSize(i);
  }

  std::swap(outSize[dim1], outSize[dim2]);

  for (int i = 0; i < Dim; ++i) {
    FAISS_ASSERT(out.getSize(i) == outSize[i]);
  }

  auto inInfo = getTensorInfo<T, Dim>(in);
  auto outInfo = getTensorInfo<T, Dim>(out);

  // Present the input as if already transposed: after swapping its sizes
  // and strides, input and output agree on the same logical index space,
  // so the kernel can address both with one linear id
  std::swap(inInfo.sizes[dim1], inInfo.sizes[dim2]);
  std::swap(inInfo.strides[dim1], inInfo.strides[dim2]);

  int totalSize = in.numElements();

  if (totalSize == 0) {
    // Empty tensor: nothing to copy. Also avoids numThreads == 0 below,
    // which would divide by zero in divUp and launch a zero-thread block.
    return;
  }

  // One thread per element, 1-D launch
  int numThreads = std::min(getMaxThreadsCurrentDevice(), totalSize);
  auto grid = dim3(utils::divUp(totalSize, numThreads));
  auto block = dim3(numThreads);

  // DimOutput = -1 takes the linear fast path for the output side.
  // NOTE(review): that path treats `out` as densely packed; confirm
  // callers always pass a contiguous output tensor.
  transposeAny<T, Dim, -1><<<grid, block, 0, stream>>>(
    inInfo, outInfo, totalSize);
  CUDA_TEST_ERROR();
}
146 
147 } } // namespace
__host__ __device__ DataPtrType data()
Returns a raw pointer to the start of our data.
Definition: Tensor.cuh:173
__host__ __device__ IndexT getStride(int i) const
Definition: Tensor.cuh:227
Our tensor type.
Definition: Tensor.cuh:30
__host__ __device__ IndexT getSize(int i) const
Definition: Tensor.cuh:221