12 #include "../../IndexIVFPQ.h"
13 #include "../../index_io.h"
14 #include "../../utils.h"
16 #include "../GpuIndexIVFPQ.h"
17 #include "IndexWrapper.h"
18 #include "../test/TestUtils.h"
19 #include "../utils/DeviceTensor.cuh"
20 #include "../utils/DeviceUtils.h"
21 #include "../utils/HostTensor.cuh"
22 #include "../utils/Timer.h"
23 #include <gflags/gflags.h>
// Command-line flags for this benchmark, declared with gflags. main() reads
// them through the generated FLAGS_<name> variables (e.g. FLAGS_num_gpus,
// FLAGS_use_precomputed) after calling ParseCommandLineFlags.

// Number of nearest neighbours returned per query (default 10).
27 DEFINE_int32(k, 10,
"final number of closest results returned");
// Path of the index file used as input.
// NOTE(review): the default is an absolute path inside one developer's home
// directory; presumably every other user has to pass --in explicitly.
28 DEFINE_string(in,
"/home/jhj/local/ivfpq_index.out",
"index file for input");
// Whether precomputed codes are used; main() forwards this to the GPU index
// via setPrecomputedCodes() and, when false, also clears
// use_precomputed_table on the CPU index.
29 DEFINE_bool(use_precomputed,
true,
"enable or disable precomputed codes");
// Whether float16 residual distance tables are requested (default off).
30 DEFINE_bool(float16_lookup,
false,
"use float16 residual distance tables");
// Number of GPUs the index is copied to (default 1).
31 DEFINE_int32(num_gpus, 1,
"number of gpus to use");
// Index-storage mode; main() casts this integer directly to
// faiss::gpu::IndicesOptions, so only the values listed in the help text are
// meaningful.
32 DEFINE_int32(index, 2,
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
// Pulls the faiss::gpu names into scope for the rest of the file.
34 using namespace faiss::gpu;
// Benchmark driver comparing CPU and GPU IVFPQ search.
//
// Visible flow: parse the gflags options, obtain a faiss::IndexIVFPQ and
// assert it is non-null, print its parameters, build GPU copies through
// initFn, then for every (query batch size, nprobe) pair run the same random
// queries on the CPU index and on the GPU index and record per-vector times.
// Finally the speedup table and both per-vector time tables are printed.
//
// NOTE(review): this listing elides many original lines (see the gaps in the
// embedded line numbers). Names such as dim, k, cpuQuery, gpuIndex,
// resultDistances and the timers are defined on lines that are not shown, and
// the closing braces are missing as well. Only the visible lines are
// reproduced here.
36 int main(
int argc,
char** argv) {
// Third argument `true` asks gflags to strip the recognised flags from argv.
37 google::ParseCommandLineFlags(&argc, &argv,
true);
// Wall-clock derived seed; the same value is passed to every float_rand call
// below, so each iteration generates its queries from the same seed.
39 auto seed = time(
nullptr);
42 auto index = std::unique_ptr<faiss::IndexIVFPQ>(
44 FAISS_ASSERT((
bool) index);
// Product-quantizer parameters, used only for the summary printout here.
47 auto codes = index->pq.M;
48 auto bitsPerCode = index->pq.nbits;
// NOTE(review): the %d / %ld specifiers below assume the argument types match
// int / long; the declarations are not visible here, so confirm against the
// faiss headers.
50 printf(
"Database: dim %d num vecs %ld\n", dim, index->ntotal);
51 printf(
"Coarse centroids: %ld\n", index->quantizer->ntotal);
52 printf(
"PQ centroids: codes %ld bits per code %ld\n", codes, bitsPerCode);
53 printf(
"L2 lookup: total k %d, precomputed codes %d\n\n",
54 k, FLAGS_use_precomputed);
57 printf(
"Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
// Snapshot the flag values into locals so the lambda can capture them by
// value; the CPU index itself is captured by reference.
59 bool precomp = FLAGS_use_precomputed;
60 auto indicesOpt = (faiss::gpu::IndicesOptions) FLAGS_index;
61 auto useFloat16Lookup = FLAGS_float16_lookup;
// Factory that returns a configured GpuIndexIVFPQ (most of its body is elided
// in this listing).
63 auto initFn = [precomp, indicesOpt, useFloat16Lookup, &index]
65 std::unique_ptr<faiss::gpu::GpuIndexIVFPQ> {
66 auto p = std::unique_ptr<faiss::gpu::GpuIndexIVFPQ>(
72 p->setPrecomputedCodes(precomp);
78 printf(
"copy done\n");
// Benchmark grid: rows are query batch sizes, columns are nprobe values.
80 auto querySizes = std::vector<int>{1, 4, 16, 64, 256, 1024, 4096, 16384};
81 auto nprobeSizes = std::vector<int>{1, 4, 8, 16, 32, 64, 128, 256};
// Dimensions of the two result tables (their declarations are elided); they
// are indexed as [q][p] below.
84 {(int) querySizes.size(), (int) nprobeSizes.size()});
86 {(int) querySizes.size(), (int) nprobeSizes.size()});
88 printf(
"GPU relative speedup over CPU (x):\n");
90 for (
auto q = 0; q < querySizes.size(); ++q) {
91 auto numQueries = querySizes[q];
94 for (
auto p = 0; p < nprobeSizes.size(); ++p) {
// Fix: nprobe must follow the inner (column) index p. The previous
// nprobeSizes[q] made every column of a row run with the same nprobe and was
// only in bounds because both vectors happen to have the same length.
95 auto nprobe = nprobeSizes[p];
98 faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
// --- CPU measurement ---
103 index->nprobe = nprobe;
105 float cpuTime = 0.0f;
// Keep the CPU index consistent with the GPU setting when precomputed codes
// are disabled by flag.
108 if (!FLAGS_use_precomputed) {
109 index->use_precomputed_table = 0;
111 index->search(numQueries, cpuQuery.data(),
112 k, resultDistances.data(), resultIndices.data());
114 cpuTimePerVector[q][p] = cpuTime / (float) numQueries;
// --- GPU measurement ---
117 gpuIndex.setNumProbes(nprobe);
120 gpuHostDistances({numQueries, k});
122 gpuHostIndices({numQueries, k});
// Device-wide syncs bracket the GPU search; presumably so the (elided) timer
// measures only the search itself.
127 CUDA_VERIFY(cudaDeviceSynchronize());
129 float gpuTime = 0.0f;
134 gpuIndex.getIndex()->search(cpuQuery.getSize(0),
137 gpuHostDistances.data(),
138 gpuHostIndices.data());
140 CUDA_VERIFY(cudaDeviceSynchronize());
144 gpuTimePerVector[q][p] = gpuTime / (float) numQueries;
// One speedup cell per (q, p) pair.
151 printf(
"%.2f", cpuTime / gpuTime);
// Per-vector tables: stored values are scaled by 1000 for printing, matching
// the "(us)" header if the timers report milliseconds (TODO confirm against
// the elided timing calls).
158 printf(
"CPU time per query vector (us):\n");
160 for (
int q = 0; q < cpuTimePerVector.getSize(0); ++q) {
163 for (
int p = 0; p < cpuTimePerVector.getSize(1); ++p) {
169 printf(
"%.1f", (
float) cpuTimePerVector[q][p] * 1000.0f);
176 printf(
"GPU time per query vector (us):\n");
178 for (
int q = 0; q < gpuTimePerVector.getSize(0); ++q) {
181 for (
int p = 0; p < gpuTimePerVector.getSize(1); ++p) {
187 printf(
"%.1f", (
float) gpuTimePerVector[q][p] * 1000.0f);
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
Index * read_index(FILE *f, bool try_mmap)
CPU wallclock elapsed timer.