Faiss
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends
DeviceVector.cuh
1 
2 /**
3  * Copyright (c) 2015-present, Facebook, Inc.
4  * All rights reserved.
5  *
6  * This source code is licensed under the CC-by-NC license found in the
7  * LICENSE file in the root directory of this source tree.
8  */
9 
10 // Copyright 2004-present Facebook. All Rights Reserved.
11 
12 #pragma once
13 
14 #include "../../FaissAssert.h"
15 #include "DeviceUtils.h"
16 #include "StaticUtils.h"
17 #include <algorithm>
18 #include <cuda.h>
19 #include <vector>
20 
21 namespace faiss { namespace gpu {
22 
23 /// A simple version of thrust::device_vector<T>, but has more control
24 /// over whether resize() initializes new space with T() (which we
25 /// don't want), and control on how much the reserved space grows by
26 /// upon resize/reserve. It is also meant for POD types only.
/// A simple version of thrust::device_vector<T>, but has more control
/// over whether resize() initializes new space with T() (which we
/// don't want), and control on how much the reserved space grows by
/// upon resize/reserve. It is also meant for POD types only.
///
/// All copies in/out of device memory are stream-ordered
/// (cudaMemcpyAsync on the caller-provided stream); the caller is
/// responsible for synchronizing the stream before touching host-side
/// results.
template <typename T>
class DeviceVector {
 public:
  DeviceVector()
      : data_(nullptr),
        num_(0),
        capacity_(0) {
  }

  // This class uniquely owns its raw cudaMalloc allocation; a shallow
  // copy would cause a double cudaFree in the destructor, so copying
  // is disallowed.
  DeviceVector(const DeviceVector&) = delete;
  DeviceVector& operator=(const DeviceVector&) = delete;

  ~DeviceVector() {
    clear();
  }

  // Clear all allocated memory; reset to zero size
  void clear() {
    // cudaFree(nullptr) is a no-op, so this is safe on an empty vector
    CUDA_VERIFY(cudaFree(data_));
    data_ = nullptr;
    num_ = 0;
    capacity_ = 0;
  }

  /// Number of elements currently in the vector
  size_t size() const { return num_; }
  /// Number of elements for which device space is reserved
  size_t capacity() const { return capacity_; }
  /// Raw device pointer to the storage (may be nullptr if empty)
  T* data() { return data_; }
  const T* data() const { return data_; }

  /// Copy our device contents to the host, reinterpreted as OutT.
  /// The total byte size must be an exact multiple of sizeof(OutT).
  /// NOTE(review): the copy is enqueued asynchronously; with pageable
  /// host memory cudaMemcpyAsync is synchronous in practice, but
  /// callers should still synchronize `stream` before relying on the
  /// returned data.
  template <typename OutT>
  std::vector<OutT> copyToHost(cudaStream_t stream) const {
    FAISS_ASSERT(num_ * sizeof(T) % sizeof(OutT) == 0);

    std::vector<OutT> out((num_ * sizeof(T)) / sizeof(OutT));
    CUDA_VERIFY(cudaMemcpyAsync(out.data(), data_, num_ * sizeof(T),
                                cudaMemcpyDeviceToHost, stream));

    return out;
  }

  /// Append `n` elements from `d` (host or device pointer) to the end
  /// of the vector, growing storage as needed.
  /// Returns true if we actually reallocated memory.
  /// If `reserveExact` is true, then we reserve only the memory that
  /// we need for what we're appending; otherwise we grow to the next
  /// power of 2 to amortize future appends.
  bool append(const T* d,
              size_t n,
              cudaStream_t stream,
              bool reserveExact = false) {
    bool mem = false;

    if (n > 0) {
      size_t reserveSize = num_ + n;
      if (!reserveExact) {
        reserveSize = getNewCapacity_(reserveSize);
      }

      mem = reserve(reserveSize, stream);

      // getDeviceForAddress returns -1 for host pointers; pick the
      // matching copy direction
      int dev = getDeviceForAddress(d);
      if (dev == -1) {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyHostToDevice, stream));
      } else {
        CUDA_VERIFY(cudaMemcpyAsync(data_ + num_, d, n * sizeof(T),
                                    cudaMemcpyDeviceToDevice, stream));
      }
      num_ += n;
    }

    return mem;
  }

  /// Resize to `newSize` elements, growing capacity (to the next power
  /// of 2) if needed. Shrinking only adjusts the logical size; the
  /// allocation is kept (use reclaim() to return memory).
  /// Returns true if we actually reallocated memory.
  bool resize(size_t newSize, cudaStream_t stream) {
    bool mem = false;

    if (num_ < newSize) {
      mem = reserve(getNewCapacity_(newSize), stream);
    }

    // Don't bother zero initializing the newly accessible memory
    // (unlike thrust::device_vector)
    num_ = newSize;

    return mem;
  }

  // Clean up after oversized allocations, while leaving some space to
  // remain for subsequent allocations (if `exact` false) or to
  // exactly the space we need (if `exact` true); returns space
  // reclaimed in bytes
  size_t reclaim(bool exact, cudaStream_t stream) {
    size_t free = capacity_ - num_;

    if (exact) {
      // Nothing to reclaim; avoid a pointless cudaMalloc/memcpy/free
      // round-trip through realloc_
      if (free == 0) {
        return 0;
      }

      realloc_(num_, stream);
      return free * sizeof(T);
    }

    // If more than 1/4th of the space is free, then we want to
    // truncate to only having 1/8th of the space free; this still
    // preserves some space for new elements, but won't force us to
    // double our size right away
    if (free > (capacity_ / 4)) {
      size_t newFree = capacity_ / 8;
      size_t newCapacity = num_ + newFree;

      size_t oldCapacity = capacity_;
      FAISS_ASSERT(newCapacity < oldCapacity);

      realloc_(newCapacity, stream);

      return (oldCapacity - newCapacity) * sizeof(T);
    }

    return 0;
  }

  /// Ensure capacity for at least `newCapacity` elements (never
  /// shrinks). Returns true if we actually reallocated memory.
  bool reserve(size_t newCapacity, cudaStream_t stream) {
    if (newCapacity <= capacity_) {
      return false;
    }

    // Otherwise, we need new space.
    realloc_(newCapacity, stream);
    return true;
  }

 private:
  /// Allocate a fresh buffer of `newCapacity` elements, copy the
  /// current `num_` elements into it (stream-ordered), and free the
  /// old buffer. `newCapacity` may be smaller than `capacity_` (used
  /// by reclaim()) but must hold all current elements.
  void realloc_(size_t newCapacity, cudaStream_t stream) {
    FAISS_ASSERT(num_ <= newCapacity);

    T* newData = nullptr;
    CUDA_VERIFY(cudaMalloc(&newData, newCapacity * sizeof(T)));

    // data_ may be nullptr when the vector is empty; skip the copy in
    // that case rather than passing a null source pointer
    if (num_ > 0) {
      CUDA_VERIFY(cudaMemcpyAsync(newData, data_, num_ * sizeof(T),
                                  cudaMemcpyDeviceToDevice, stream));
    }

    // FIXME: keep on reclamation queue to avoid hammering cudaFree?
    CUDA_VERIFY(cudaFree(data_));

    data_ = newData;
    capacity_ = newCapacity;
  }

  /// Growth policy: round the requested size up to the next power of 2
  /// so repeated appends are amortized O(1)
  size_t getNewCapacity_(size_t preferredSize) const {
    return utils::nextHighestPowerOf2(preferredSize);
  }

  /// Device pointer to our allocation (nullptr when empty)
  T* data_;
  /// Number of valid elements
  size_t num_;
  /// Number of elements allocated
  size_t capacity_;
};
175 
176 } } // namespace