12 #include <cuda_profiler_api.h>
13 #include "../../IndexFlat.h"
14 #include "../../IndexIVFPQ.h"
15 #include "../GpuIndexIVFPQ.h"
16 #include "../StandardGpuResources.h"
17 #include "../test/TestUtils.h"
18 #include "../utils/DeviceUtils.h"
19 #include "../utils/Timer.h"
20 #include <gflags/gflags.h>
24 DEFINE_int32(batches, 10,
"number of batches of vectors to add");
25 DEFINE_int32(batch_size, 10000,
"number of vectors in each batch");
26 DEFINE_int32(dim, 256,
"dimension of vectors");
27 DEFINE_int32(centroids, 4096,
"num coarse centroids to use");
28 DEFINE_int32(bytes_per_vec, 32,
"bytes per encoded vector");
29 DEFINE_int32(bits_per_code, 8,
"bits per PQ code");
30 DEFINE_int32(index, 2,
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
31 DEFINE_bool(time_gpu,
true,
"time add to GPU");
32 DEFINE_bool(time_cpu,
false,
"time add to CPU");
33 DEFINE_bool(per_batch_time,
false,
"print per-batch times");
34 DEFINE_bool(reserve_memory,
false,
"whether or not to pre-reserve memory");
// Benchmark entry point: adds FLAGS_batches batches of FLAGS_batch_size
// vectors produced by faiss::gpu::randVecs to `gpuIndex` (and to `cpuIndex`
// when --time_cpu is set), then prints total and per-vector add times.
//
// NOTE(review): this listing is incomplete. The declarations of `dim`, `res`,
// `config`, `cpuIndex`, `gpuIndex` and `time`, the timer code, and all closing
// braces are not visible here, so the comments below describe only the
// statements that are shown.
36 int main(
int argc,
char** argv) {
// Parse the options declared with DEFINE_* above into their FLAGS_* variables.
37 gflags::ParseCommandLineFlags(&argc, &argv,
true);
// Local copies of the index-shape flags.
42 int numCentroids = FLAGS_centroids;
43 int bytesPerVec = FLAGS_bytes_per_vec;
44 int bitsPerCode = FLAGS_bits_per_code;
// Training set: four vectors per coarse centroid, generated by randVecs.
49 int numTrain = 4 * numCentroids;
50 std::vector<float> trainVecs = faiss::gpu::randVecs(numTrain, dim);
// Tail of a statement whose beginning is not visible in this listing.
54 bytesPerVec, bitsPerCode);
// Train the CPU index on the shared training vectors.
56 cpuIndex.train(numTrain, trainVecs.data());
// Tail of a statement whose beginning is not visible in this listing
// (presumably the gpuIndex construction -- TODO confirm).
64 &res, dim, numCentroids, bytesPerVec, bitsPerCode,
65 faiss::METRIC_L2, config);
// Train the GPU index on the same training vectors as the CPU index.
68 gpuIndex.train(numTrain, trainVecs.data());
// Optionally reserve room for every vector the run will add. The count is
// computed in size_t, so batches * batch_size is not truncated to int here.
69 if (FLAGS_reserve_memory) {
70 size_t numVecs = (size_t) FLAGS_batches * (
size_t) FLAGS_batch_size;
71 gpuIndex.reserveMemory(numVecs);
// NOTE(review): the return value of this synchronize is ignored, whereas the
// per-batch synchronize inside the loop is wrapped in CUDA_VERIFY; consider
// checking it the same way.
75 cudaDeviceSynchronize();
// Profiler capture brackets the add loop (matching cudaProfilerStop below).
76 CUDA_VERIFY(cudaProfilerStart());
// Accumulated add times; the summary below prints them as milliseconds.
// NOTE(review): only `totalCpuTime += time` is visible in this listing; the
// GPU accumulation is presumably on a line that is not shown.
78 float totalGpuTime = 0.0f;
79 float totalCpuTime = 0.0f;
// One iteration per batch.
81 for (
int i = 0; i < FLAGS_batches; ++i) {
// Print a plain progress line only when per-batch timing output is off.
82 if (!FLAGS_per_batch_time) {
84 printf(
"Adding batch %d\n", i + 1);
// Fresh vectors for this batch; the same data is handed to both indices.
88 auto addVecs = faiss::gpu::randVecs(FLAGS_batch_size, dim);
// Add to the GPU index, then wait for the device to finish -- presumably so
// that the elapsed time read on a line not shown covers the device work.
92 gpuIndex.add(FLAGS_batch_size, addVecs.data());
93 CUDA_VERIFY(cudaDeviceSynchronize());
// Per-batch GPU report: batch time in ms and ms per vector.
98 if (FLAGS_per_batch_time) {
99 printf(
"Batch %d | GPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
100 i + 1, FLAGS_batch_size, time, time / (
float) FLAGS_batch_size);
// CPU path: add the same batch to the CPU index and accumulate its time.
104 if (FLAGS_time_cpu) {
106 cpuIndex.add(FLAGS_batch_size, addVecs.data());
109 totalCpuTime += time;
// Per-batch CPU report, same layout as the GPU one.
111 if (FLAGS_per_batch_time) {
112 printf(
"Batch %d | CPU time to add %d vecs: %.3f ms (%.5f ms per)\n",
113 i + 1, FLAGS_batch_size, time, time / (
float) FLAGS_batch_size);
// End of the profiled region.
118 CUDA_VERIFY(cudaProfilerStop());
// Total number of vectors added across all batches.
// NOTE(review): this product is computed in int, while the reserve-memory
// path above computes the same product in size_t; large --batches /
// --batch_size values can overflow here. Consider a 64-bit type together with
// matching printf format specifiers.
120 int total = FLAGS_batch_size * FLAGS_batches;
// GPU summary: total ms, and (total ms * 1000 / total) as microseconds per
// vector.
122 if (FLAGS_time_gpu) {
123 printf(
"%d dim, %d centroids, %d x %d encoding\n"
124 "GPU time to add %d vectors (%d batches, %d per batch): "
125 "%.3f ms (%.3f us per)\n",
126 dim, numCentroids, bytesPerVec, bitsPerCode,
127 total, FLAGS_batches, FLAGS_batch_size,
128 totalGpuTime, totalGpuTime * 1000.0f / (
float) total);
// CPU summary, same layout as the GPU summary.
131 if (FLAGS_time_cpu) {
132 printf(
"%d dim, %d centroids, %d x %d encoding\n"
133 "CPU time to add %d vectors (%d batches, %d per batch): "
134 "%.3f ms (%.3f us per)\n",
135 dim, numCentroids, bytesPerVec, bitsPerCode,
136 total, FLAGS_batches, FLAGS_batch_size,
137 totalCpuTime, totalCpuTime * 1000.0f / (
float) total);
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
CPU wallclock elapsed timer.
int device
GPU device on which the index is resident.
IndicesOptions indicesOptions
Index storage options for the GPU.