// DeviceVector.cuh
/**
 * Copyright (c) 2015-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the BSD+Patents license found in the
 * LICENSE file in the root directory of this source tree.
 */

// Copyright 2004-present Facebook. All Rights Reserved.
11 #pragma once
12 
13 #include "../../FaissAssert.h"
14 #include "DeviceUtils.h"
15 #include "MemorySpace.h"
16 #include "StaticUtils.h"
17 #include <algorithm>
18 #include <cuda.h>
19 #include <vector>
20 
21 namespace faiss { namespace gpu {
22 
/// A simple version of thrust::device_vector<T>, but has more control
/// over whether resize() initializes new space with T() (which we
/// don't want), and control on how much the reserved space grows by
/// upon resize/reserve. It is also meant for POD types only.
///
/// All copies are asynchronous on the stream passed to each call; the
/// caller is responsible for synchronizing the stream before reading
/// host-side results.
template <typename T>
class DeviceVector {
 public:
  /// `space` selects where allocations come from (passed through to
  /// allocMemorySpace on every growth).
  DeviceVector(MemorySpace space = MemorySpace::Device)
      : data_(nullptr),
        num_(0),
        capacity_(0),
        space_(space) {
  }

  // We own a raw device allocation which the destructor frees; the
  // implicit (shallow) copy would double-free it, so forbid copying.
  DeviceVector(const DeviceVector&) = delete;
  DeviceVector& operator=(const DeviceVector&) = delete;

  ~DeviceVector() {
    clear();
  }

  // Clear all allocated memory; reset to zero size
  void clear() {
    // NOTE(review): allocation goes through allocMemorySpace(space_, ...)
    // but deallocation is a bare cudaFree — assumes cudaFree is the
    // correct deallocator for every MemorySpace value; confirm against
    // MemorySpace.h
    CUDA_VERIFY(cudaFree(data_));
    data_ = nullptr;
    num_ = 0;
    capacity_ = 0;
  }

  size_t size() const { return num_; }
  size_t capacity() const { return capacity_; }
  T* data() { return data_; }
  const T* data() const { return data_; }

  /// Copies our contents, reinterpreted as OutT, into a new host
  /// vector. The total byte size must divide evenly by sizeof(OutT)
  /// (asserted). Async on `stream`; synchronize before reading.
  template <typename OutT>
  std::vector<OutT> copyToHost(cudaStream_t stream) const {
    FAISS_ASSERT(num_ * sizeof(T) % sizeof(OutT) == 0);

    std::vector<OutT> out((num_ * sizeof(T)) / sizeof(OutT));

    if (num_ > 0) {
      // Guard: when empty, data_ (and out.data()) may be null; avoid
      // handing null pointers to cudaMemcpyAsync
      CUDA_VERIFY(cudaMemcpyAsync(out.data(), data_, num_ * sizeof(T),
                                  cudaMemcpyDeviceToHost, stream));
    }

    return out;
  }

  // Appends `n` elements from `d`, which may be a host or a device
  // pointer (distinguished via getDeviceForAddress).
  // Returns true if we actually reallocated memory
  // If `reserveExact` is true, then we reserve only the memory that
  // we need for what we're appending
  bool append(const T* d,
              size_t n,
              cudaStream_t stream,
              bool reserveExact = false) {
    bool mem = false;

    if (n > 0) {
      size_t reserveSize = num_ + n;
      if (!reserveExact) {
        // Grow geometrically (next power of 2) to amortize repeated
        // appends
        reserveSize = getNewCapacity_(reserveSize);
      }

      mem = reserve(reserveSize, stream);

      // getDeviceForAddress returns -1 for host-resident pointers
      int dev = getDeviceForAddress(d);
      if (dev == -1) {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyHostToDevice, stream));
      } else {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyDeviceToDevice, stream));
      }
      num_ += n;
    }

    return mem;
  }

  /// Resizes to `newSize` elements; newly-visible elements are left
  /// uninitialized.
  // Returns true if we actually reallocated memory
  bool resize(size_t newSize, cudaStream_t stream) {
    bool mem = false;

    if (num_ < newSize) {
      mem = reserve(getNewCapacity_(newSize), stream);
    }

    // Don't bother zero initializing the newly accessible memory
    // (unlike thrust::device_vector)
    num_ = newSize;

    return mem;
  }

  // Clean up after oversized allocations, while leaving some space to
  // remain for subsequent allocations (if `exact` false) or to
  // exactly the space we need (if `exact` true); returns space
  // reclaimed in bytes
  size_t reclaim(bool exact, cudaStream_t stream) {
    size_t free = capacity_ - num_;

    if (exact) {
      // Skip the realloc (alloc + D2D copy + free) when the
      // allocation is already exact; there is nothing to reclaim
      if (free > 0) {
        realloc_(num_, stream);
      }
      return free * sizeof(T);
    }

    // If more than 1/4th of the space is free, then we want to
    // truncate to only having 1/8th of the space free; this still
    // preserves some space for new elements, but won't force us to
    // double our size right away
    if (free > (capacity_ / 4)) {
      size_t newFree = capacity_ / 8;
      size_t newCapacity = num_ + newFree;

      size_t oldCapacity = capacity_;
      FAISS_ASSERT(newCapacity < oldCapacity);

      realloc_(newCapacity, stream);

      return (oldCapacity - newCapacity) * sizeof(T);
    }

    return 0;
  }

  /// Ensures capacity for at least `newCapacity` elements; never
  /// shrinks.
  // Returns true if we actually reallocated memory
  bool reserve(size_t newCapacity, cudaStream_t stream) {
    if (newCapacity <= capacity_) {
      return false;
    }

    // Otherwise, we need new space.
    realloc_(newCapacity, stream);
    return true;
  }

 private:
  /// Allocates a buffer of exactly `newCapacity` elements in our
  /// memory space, copies the `num_` live elements over on `stream`,
  /// and frees the old buffer. Capacity may shrink (used by reclaim).
  void realloc_(size_t newCapacity, cudaStream_t stream) {
    FAISS_ASSERT(num_ <= newCapacity);

    T* newData = nullptr;
    allocMemorySpace(space_, (void**) &newData, newCapacity * sizeof(T));
    if (num_ > 0) {
      // Guard: data_ is null on the very first allocation; avoid a
      // null source pointer even for a zero-byte copy
      CUDA_VERIFY(cudaMemcpyAsync(newData, data_, num_ * sizeof(T),
                                  cudaMemcpyDeviceToDevice, stream));
    }
    // FIXME: keep on reclamation queue to avoid hammering cudaFree?
    CUDA_VERIFY(cudaFree(data_));

    data_ = newData;
    capacity_ = newCapacity;
  }

  /// Growth policy: round the preferred size up to the next power of 2
  size_t getNewCapacity_(size_t preferredSize) {
    return utils::nextHighestPowerOf2(preferredSize);
  }

  /// Device-space allocation; nullptr when capacity_ == 0
  T* data_;

  /// Number of live elements (<= capacity_)
  size_t num_;

  /// Number of allocated elements
  size_t capacity_;

  /// Where our allocations are made (see MemorySpace.h)
  MemorySpace space_;
};
177 
178 } } // namespace