11 #include "../../IndexIVFPQ.h"
12 #include "../../index_io.h"
13 #include "../../utils.h"
15 #include "../GpuIndexIVFPQ.h"
16 #include "IndexWrapper.h"
17 #include "../test/TestUtils.h"
18 #include "../utils/DeviceTensor.cuh"
19 #include "../utils/DeviceUtils.h"
20 #include "../utils/HostTensor.cuh"
21 #include "../utils/Timer.h"
22 #include <gflags/gflags.h>
// Command-line flags (gflags) that configure this benchmark run.

// Number of nearest neighbours requested from each search.
26 DEFINE_int32(k, 10,
"final number of closest results returned");
// Path of the index file used as input.
27 DEFINE_string(in,
"/home/jhj/local/ivfpq_index.out",
"index file for input");
// Enables precomputed codes; when false, main() sets
// use_precomputed_table = 0 on the CPU index before searching.
28 DEFINE_bool(use_precomputed,
true,
"enable or disable precomputed codes");
// Requests float16 residual distance tables; captured by the GPU index
// initialisation lambda in main().
29 DEFINE_bool(float16_lookup,
false,
"use float16 residual distance tables");
// Number of GPUs to use; echoed in the "Copying index to %d GPU(s)" message.
30 DEFINE_int32(num_gpus, 1,
"number of gpus to use");
// Integer that main() casts to faiss::gpu::IndicesOptions. Per the help
// text: 0 = no indices on GPU, 1 = 32 bit, 2 = 64 bit.
31 DEFINE_int32(index, 2,
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
33 using namespace faiss::gpu;
// Benchmark entry point: compares CPU IndexIVFPQ search against the GPU
// index over a grid of (query batch size) x (nprobe) settings and prints
// three tables: relative speedup, CPU time per query vector, and GPU time
// per query vector.
//
// NOTE(review): this view of the function has gaps (several declarations,
// the timers and the closing braces are not visible); comments below only
// describe what the visible statements show.
35 int main(
int argc,
char** argv) {
36 gflags::ParseCommandLineFlags(&argc, &argv,
true);
// Current wall-clock time; passed to faiss::float_rand as the seed below.
38 auto seed = time(
nullptr);
// Owning pointer to the CPU IVFPQ index; asserted non-null before use.
41 auto index = std::unique_ptr<faiss::IndexIVFPQ>(
43 FAISS_ASSERT((
bool) index);
46 auto codes = index->pq.M;
47 auto bitsPerCode = index->pq.nbits;
// Report the configuration being benchmarked.
49 printf(
"Database: dim %d num vecs %ld\n", dim, index->ntotal);
50 printf(
"Coarse centroids: %ld\n", index->quantizer->ntotal);
51 printf(
"PQ centroids: codes %ld bits per code %ld\n", codes, bitsPerCode);
52 printf(
"L2 lookup: total k %d, precomputed codes %d\n\n",
53 k, FLAGS_use_precomputed);
56 printf(
"Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
// Copy the flag values into locals so the lambda below can capture them
// by value.
58 bool precomp = FLAGS_use_precomputed;
59 auto indicesOpt = (faiss::gpu::IndicesOptions) FLAGS_index;
60 auto useFloat16Lookup = FLAGS_float16_lookup;
62 auto initFn = [precomp, indicesOpt, useFloat16Lookup, &index]
64 std::unique_ptr<faiss::gpu::GpuIndexIVFPQ> {
72 auto p = std::unique_ptr<faiss::gpu::GpuIndexIVFPQ>(
79 printf(
"copy done\n");
// Sweep axes: rows of the result tables are query batch sizes, columns are
// nprobe values (both extents are taken from these vectors' sizes).
81 auto querySizes = std::vector<int>{1, 4, 16, 64, 256, 1024, 4096, 16384};
82 auto nprobeSizes = std::vector<int>{1, 4, 8, 16, 32, 64, 128, 256};
85 {(int) querySizes.size(), (int) nprobeSizes.size()});
87 {(int) querySizes.size(), (int) nprobeSizes.size()});
89 printf(
"GPU relative speedup over CPU (x):\n");
91 for (
auto q = 0; q < querySizes.size(); ++q) {
92 auto numQueries = querySizes[q];
95 for (
auto p = 0; p < nprobeSizes.size(); ++p) {
// Index by the inner-loop variable p so that each [q][p] cell is measured
// with its own nprobe value. (Previously indexed by q, which made every
// column of a row use the same nprobe.)
96 auto nprobe = nprobeSizes[p];
// Fill the query buffer using the seed chosen at start-up.
99 faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
// CPU side: use the same nprobe as the GPU index is given below.
104 index->nprobe = nprobe;
106 float cpuTime = 0.0f;
// Honour --use_precomputed=false on the CPU index as well.
109 if (!FLAGS_use_precomputed) {
110 index->use_precomputed_table = 0;
112 index->search(numQueries, cpuQuery.data(),
113 k, resultDistances.data(), resultIndices.data());
115 cpuTimePerVector[q][p] = cpuTime / (float) numQueries;
118 gpuIndex.setNumProbes(nprobe);
121 gpuHostDistances({numQueries, k});
123 gpuHostIndices({numQueries, k});
// Wait for outstanding GPU work before the GPU search is issued
// (presumably so it is excluded from the timed region; the timer lines are
// not visible in this view).
128 CUDA_VERIFY(cudaDeviceSynchronize());
130 float gpuTime = 0.0f;
135 gpuIndex.getIndex()->search(cpuQuery.getSize(0),
138 gpuHostDistances.data(),
139 gpuHostIndices.data());
// Wait for the GPU search to finish.
141 CUDA_VERIFY(cudaDeviceSynchronize());
145 gpuTimePerVector[q][p] = gpuTime / (float) numQueries;
152 printf(
"%.2f", cpuTime / gpuTime);
// Per-vector CPU times, scaled by 1000 before printing under the (us) heading.
159 printf(
"CPU time per query vector (us):\n");
161 for (
int q = 0; q < cpuTimePerVector.getSize(0); ++q) {
164 for (
int p = 0; p < cpuTimePerVector.getSize(1); ++p) {
170 printf(
"%.1f", (
float) cpuTimePerVector[q][p] * 1000.0f);
// Per-vector GPU times, printed with the same scaling as the CPU table.
177 printf(
"GPU time per query vector (us):\n");
179 for (
int q = 0; q < gpuTimePerVector.getSize(0); ++q) {
182 for (
int p = 0; p < gpuTimePerVector.getSize(1); ++p) {
188 printf(
"%.1f", (
float) gpuTimePerVector[q][p] * 1000.0f);
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
bool usePrecomputedTables
Index * read_index(FILE *f, bool try_mmap)
CPU wallclock elapsed timer.
bool useFloat16LookupTables
int device
GPU device on which the index is resident.
IndicesOptions indicesOptions
Index storage options for the GPU.