11 #include "../../IndexIVFPQ.h"
12 #include "../../index_io.h"
13 #include "../../utils.h"
15 #include "../GpuIndexIVFPQ.h"
16 #include "IndexWrapper.h"
17 #include "../test/TestUtils.h"
18 #include "../utils/DeviceTensor.cuh"
19 #include "../utils/DeviceUtils.h"
20 #include "../utils/HostTensor.cuh"
21 #include "../utils/Timer.h"
22 #include <gflags/gflags.h>
// Command-line flags for this benchmark, declared with gflags. The code below
// reads them back through the FLAGS_<name> variables (e.g. FLAGS_num_gpus,
// FLAGS_use_precomputed, FLAGS_index).

// k: how many nearest results each search is asked to return.
26 DEFINE_int32(k, 10,
"final number of closest results returned");
// in: path of the index file used as input.
// NOTE(review): the default is a hard-coded path inside one developer's home
// directory, so on any other machine it presumably has to be overridden.
27 DEFINE_string(in,
"/home/jhj/local/ivfpq_index.out",
"index file for input");
// use_precomputed: when false, main() sets use_precomputed_table = 0 on the
// CPU index before searching; the value is also captured for the GPU setup.
28 DEFINE_bool(use_precomputed,
true,
"enable or disable precomputed codes");
// float16_lookup: copied into useFloat16Lookup and captured by the GPU index
// initialisation lambda in main().
29 DEFINE_bool(float16_lookup,
false,
"use float16 residual distance tables");
// num_gpus: number of GPUs the index is copied to (reported by main()).
30 DEFINE_int32(num_gpus, 1,
"number of gpus to use");
// index: cast directly to faiss::gpu::IndicesOptions in main(); the help text
// lists the accepted integer values.
31 DEFINE_int32(index, 2,
"0 = no indices on GPU; 1 = 32 bit, 2 = 64 bit on GPU");
33 using namespace faiss::gpu;
35 int main(
int argc,
char** argv) {
36 gflags::ParseCommandLineFlags(&argc, &argv,
true);
38 auto seed = time(
nullptr);
41 auto index = std::unique_ptr<faiss::IndexIVFPQ>(
43 FAISS_ASSERT((
bool) index);
46 auto codes = index->pq.M;
47 auto bitsPerCode = index->pq.nbits;
49 printf(
"Database: dim %d num vecs %ld\n", dim, index->ntotal);
50 printf(
"Coarse centroids: %ld\n", index->quantizer->ntotal);
51 printf(
"PQ centroids: codes %ld bits per code %ld\n", codes, bitsPerCode);
52 printf(
"L2 lookup: total k %d, precomputed codes %d\n\n",
53 k, FLAGS_use_precomputed);
56 printf(
"Copying index to %d GPU(s)...\n", FLAGS_num_gpus);
58 bool precomp = FLAGS_use_precomputed;
59 auto indicesOpt = (faiss::gpu::IndicesOptions) FLAGS_index;
60 auto useFloat16Lookup = FLAGS_float16_lookup;
62 auto initFn = [precomp, indicesOpt, useFloat16Lookup, &index]
64 std::unique_ptr<faiss::gpu::GpuIndexIVFPQ> {
72 auto p = std::unique_ptr<faiss::gpu::GpuIndexIVFPQ>(
79 printf(
"copy done\n");
81 auto querySizes = std::vector<int>{1, 4, 16, 64, 256, 1024, 4096, 16384};
82 auto nprobeSizes = std::vector<int>{1, 4, 8, 16, 32, 64, 128, 256};
85 {(int) querySizes.size(), (int) nprobeSizes.size()});
87 {(int) querySizes.size(), (int) nprobeSizes.size()});
89 printf(
"GPU relative speedup over CPU (x):\n");
91 for (
auto q = 0; q < querySizes.size(); ++q) {
92 auto numQueries = querySizes[q];
95 for (
auto p = 0; p < nprobeSizes.size(); ++p) {
96 auto nprobe = nprobeSizes[p]; // index with the inner-loop counter p (was q): each column of the [q][p] result tables must use its own nprobe value
99 faiss::float_rand(cpuQuery.data(), cpuQuery.numElements(), seed);
104 index->nprobe = nprobe;
106 float cpuTime = 0.0f;
109 if (!FLAGS_use_precomputed) {
110 index->use_precomputed_table = 0;
112 index->search(numQueries, cpuQuery.data(),
113 k, resultDistances.data(), resultIndices.data());
115 cpuTimePerVector[q][p] = cpuTime / (float) numQueries;
118 gpuIndex.setNumProbes(nprobe);
121 gpuHostDistances({numQueries, k});
123 gpuHostIndices({numQueries, k});
128 CUDA_VERIFY(cudaDeviceSynchronize());
130 float gpuTime = 0.0f;
135 gpuIndex.getIndex()->search(cpuQuery.getSize(0),
138 gpuHostDistances.data(),
139 gpuHostIndices.data());
141 CUDA_VERIFY(cudaDeviceSynchronize());
145 gpuTimePerVector[q][p] = gpuTime / (float) numQueries;
152 printf(
"%.2f", cpuTime / gpuTime);
159 printf(
"CPU time per query vector (us):\n");
161 for (
int q = 0; q < cpuTimePerVector.getSize(0); ++q) {
164 for (
int p = 0; p < cpuTimePerVector.getSize(1); ++p) {
170 printf(
"%.1f", (
float) cpuTimePerVector[q][p] * 1000.0f);
177 printf(
"GPU time per query vector (us):\n");
179 for (
int q = 0; q < gpuTimePerVector.getSize(0); ++q) {
182 for (
int p = 0; p < gpuTimePerVector.getSize(1); ++p) {
188 printf(
"%.1f", (
float) gpuTimePerVector[q][p] * 1000.0f);
float elapsedMilliseconds()
Returns elapsed time in milliseconds.
bool usePrecomputedTables
CPU wallclock elapsed timer.
bool useFloat16LookupTables
int device
GPU device on which the index is resident.
IndicesOptions indicesOptions
Index storage options for the GPU.