10 #include "../../IndexIVFPQ.h"
11 #include "../../index_io.h"
12 #include "../../utils.h"
14 #include "../GpuIndexIVFPQ.h"
15 #include "IndexWrapper.h"
16 #include "../test/TestUtils.h"
17 #include "../utils/DeviceTensor.cuh"
18 #include "../utils/DeviceUtils.h"
19 #include "../utils/HostTensor.cuh"
20 #include "../utils/Timer.h"
21 #include <gflags/gflags.h>
// Command-line flags (gflags) controlling the benchmark.

// --k: number of results requested per query (default 10).
25 DEFINE_int32(k, 10,
"final number of closest results returned");
// --in: path of the index file to read.
// NOTE(review): the default is a developer-specific absolute path, so most
// users will have to pass --in explicitly.
26 DEFINE_string(in,
"/home/jhj/local/ivfpq_index.out",
"index file for input");
// --use_precomputed: toggles precomputed codes (default true). main() also
// clears use_precomputed_table on the CPU index when this is false.
27 DEFINE_bool(use_precomputed,
true,
"enable or disable precomputed codes");
// --float16_lookup: selects float16 residual distance tables (default false).
28 DEFINE_bool(float16_lookup,
false,
"use float16 residual distance tables");
// --num_gpus: number of GPUs the index is copied to (default 1).
29 DEFINE_int32(num_gpus, 1,
"number of gpus to use");
// --index: cast to faiss::gpu::IndicesOptions in main(); see the help text
// for the meaning of 0/1/2 (default 2).
30 DEFINE_int32(index, 2,
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
32 using namespace faiss::gpu;
// Benchmark entry point. Parses the gflags options, obtains a CPU
// faiss::IndexIVFPQ, sets up GPU copies of it, then for every
// (query batch size, nprobe) combination runs the same search on the CPU and
// on the GPU and prints three tables: GPU-over-CPU speedup, CPU time per query
// vector and GPU time per query vector.
//
// NOTE(review): this listing is an excerpt. Several statements are on lines
// that are not shown here (among them the definitions of dim, k, gpuIndex,
// cpuQuery, resultDistances/resultIndices, the timers and the closing braces),
// so the comments below only describe what the visible lines establish.
34 int main(
int argc,
char** argv) {
35 gflags::ParseCommandLineFlags(&argc, &argv,
true);
// Wall-clock based seed; passed to faiss::float_rand when filling the queries.
37 auto seed = time(
nullptr);
// CPU index held by unique_ptr (constructor argument not visible here;
// presumably read from FLAGS_in -- TODO confirm). Asserted to be non-null.
40 auto index = std::unique_ptr<faiss::IndexIVFPQ>(
42 FAISS_ASSERT((
bool) index);
// PQ parameters, only used for the summary printed below.
45 auto codes = index->pq.M;
46 auto bitsPerCode = index->pq.nbits;
48 printf(
"Database: dim %d num vecs %ld\n", dim, index->ntotal);
49 printf(
"Coarse centroids: %ld\n", index->quantizer->ntotal);
50 printf(
"PQ centroids: codes %ld bits per code %ld\n", codes, bitsPerCode);
51 printf(
"L2 lookup: total k %d, precomputed codes %d\n\n",
52 k, FLAGS_use_precomputed);
55 printf(
"Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
// Snapshot the flag values into locals so the lambda below can capture them
// by value.
57 bool precomp = FLAGS_use_precomputed;
58 auto indicesOpt = (faiss::gpu::IndicesOptions) FLAGS_index;
59 auto useFloat16Lookup = FLAGS_float16_lookup;
// Factory lambda returning a std::unique_ptr<GpuIndexIVFPQ>; it captures the
// CPU index by reference. Only part of its body is visible in this excerpt.
61 auto initFn = [precomp, indicesOpt, useFloat16Lookup, &index]
63 std::unique_ptr<faiss::gpu::GpuIndexIVFPQ> {
71 auto p = std::unique_ptr<faiss::gpu::GpuIndexIVFPQ>(
78 printf(
"copy done\n");
// Benchmark grid: rows are query batch sizes, columns are nprobe values.
80 auto querySizes = std::vector<int>{1, 4, 16, 64, 256, 1024, 4096, 16384};
81 auto nprobeSizes = std::vector<int>{1, 4, 8, 16, 32, 64, 128, 256};
// Size arguments of two [querySizes x nprobeSizes] tables (presumably
// cpuTimePerVector and gpuTimePerVector, whose declarations start on lines
// not shown -- TODO confirm).
84 {(int) querySizes.size(), (int) nprobeSizes.size()});
86 {(int) querySizes.size(), (int) nprobeSizes.size()});
88 printf(
"GPU relative speedup over CPU (x):\n");
90 for (
auto q = 0; q < querySizes.size(); ++q) {
91 auto numQueries = querySizes[q];
94 for (
auto p = 0; p < nprobeSizes.size(); ++p) {
// Fix: the nprobe value belongs to the inner (column) loop, so it must be
// selected with p. Indexing with q made every column of a row use the same
// nprobe and mislabelled the [q][p] result tables.
95 auto nprobe = nprobeSizes[p];
// Fill the query buffer with random floats using the seed taken at startup.
98 faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
// --- CPU search ---
103 index->nprobe = nprobe;
105 float cpuTime = 0.0f;
// Honour --use_precomputed=false on the CPU index as well.
108 if (!FLAGS_use_precomputed) {
109 index->use_precomputed_table = 0;
111 index->search(numQueries, cpuQuery.data(),
112 k, resultDistances.data(), resultIndices.data());
// Stored per query vector; the assignment of cpuTime from a timer is on a
// line not shown here.
114 cpuTimePerVector[q][p] = cpuTime / (float) numQueries;
// --- GPU search ---
117 gpuIndex.setNumProbes(nprobe);
// Result buffers for the GPU search, numQueries x k each.
120 gpuHostDistances({numQueries, k});
122 gpuHostIndices({numQueries, k});
// Device-wide syncs before and after the GPU search call.
127 CUDA_VERIFY(cudaDeviceSynchronize());
129 float gpuTime = 0.0f;
134 gpuIndex.getIndex()->search(cpuQuery.getSize(0),
137 gpuHostDistances.data(),
138 gpuHostIndices.data());
140 CUDA_VERIFY(cudaDeviceSynchronize());
144 gpuTimePerVector[q][p] = gpuTime / (float) numQueries;
// One speedup cell of the first table.
151 printf(
"%.2f", cpuTime / gpuTime);
// Second table: CPU time per query vector. The stored value is multiplied by
// 1000 for display under the "(us)" heading (presumably ms -> us -- confirm
// against the timer used).
158 printf(
"CPU time per query vector (us):\n");
160 for (
int q = 0; q < cpuTimePerVector.getSize(0); ++q) {
163 for (
int p = 0; p < cpuTimePerVector.getSize(1); ++p) {
169 printf(
"%.1f", (
float) cpuTimePerVector[q][p] * 1000.0f);
// Third table: GPU time per query vector, same scaling as the CPU table.
176 printf(
"GPU time per query vector (us):\n");
178 for (
int q = 0; q < gpuTimePerVector.getSize(0); ++q) {
181 for (
int p = 0; p < gpuTimePerVector.getSize(1); ++p) {
187 printf(
"%.1f", (
float) gpuTimePerVector[q][p] * 1000.0f);
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
bool usePrecomputedTables
CPU wallclock elapsed timer.
bool useFloat16LookupTables
int device
GPU device on which the index is resident.
IndicesOptions indicesOptions
Index storage options for the GPU.