# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Benchmark GPU IVFPQ train / add / search with and without RAFT on SIFT1M.

For each operation the script times the RAFT-enabled GPU index and, unless
``--raft_only`` is given, the classical GPU index as well, then prints one
summary line per measurement.
"""

import argparse
import time

import numpy as np
import faiss
import rmm

from datasets import load_sift1M, evaluate


######################################################
# Command-line parsing
######################################################

def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is True
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("1", "true", "t", "yes", "y", "on"):
        return True
    if lowered in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    """Add an argument to the argument group currently bound to ``group``."""
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')
# The search section below reads args.bm_search; without this option the
# script raised AttributeError after the train and add benchmarks finished.
aa('--bm_search', default=True, type=_str2bool,
   help='whether to benchmark search operation on GPU index')

group = parser.add_argument_group('IVF options')
aa('--bits_per_code', default=8, type=int,
   help='bits per code. Note that < 8 is only supported when RAFT is enabled')
aa('--pq_len', default=2, type=int,
   help='number of vector elements represented by one PQ code')
# NOTE(review): this option is parsed but not referenced anywhere else in
# this script as shown -- confirm whether it should be applied to the
# cloner options of the classical (non-RAFT) index.
aa('--use_precomputed', default=True, type=_str2bool,
   help='use precomputed codes (not with RAFT enabled)')

group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

# Load the dataset only after the arguments parsed successfully, so that
# --help or a bad flag does not pay for reading the whole dataset.
print("load data")
xb, xq, xt, gt = load_sift1M()

# Seeded generator used to draw the query subsets reproducibly.
rs = np.random.RandomState(123)

res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)


# A heuristic to select a suitable number of lists
def compute_nlist(numVecs):
    """Return the number of IVF lists to use for ``numVecs`` vectors.

    Starts from sqrt(numVecs); if that leaves fewer than 1000 vectors per
    list, uses numVecs / 1000 instead. The result is never below 1, so the
    index factory string stays valid for small inputs.
    """
    if numVecs <= 0:
        return 1
    nlist = np.sqrt(numVecs)
    if numVecs / nlist < 1000:
        nlist = numVecs / 1000
    return max(1, int(nlist))


def _clone_to_gpu(index, use_raft):
    """Clone the CPU ``index`` to GPU 0 with the benchmark's cloner options."""
    co = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    co.useFloat16LookupTables = True
    co.use_raft = use_raft
    return faiss.index_cpu_to_gpu(res, 0, index, co)


def bench_train_milliseconds(index, trainVecs, use_raft):
    """Return the wall-clock time, in ms, to train a GPU clone of ``index``.

    Only the ``train`` call is timed; cloning to the GPU is not.
    """
    index_gpu = _clone_to_gpu(index, use_raft)
    t0 = time.perf_counter()
    index_gpu.train(trainVecs)
    return 1000 * (time.perf_counter() - t0)


def bench_add_milliseconds(index, addVecs, use_raft):
    """Return the wall-clock time, in ms, to add ``addVecs`` to a GPU clone.

    Only the ``add`` call is timed; cloning and ``copyFrom`` are not.
    """
    index_gpu = _clone_to_gpu(index, use_raft)
    index_gpu.copyFrom(index)
    t0 = time.perf_counter()
    index_gpu.add(addVecs)
    return 1000 * (time.perf_counter() - t0)


def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    """Return the wall-clock time, in ms, of one k-NN search on a GPU clone.

    The clone is populated with ``addVecs`` and configured with ``nprobe``
    before the timer starts; only the ``search`` call is timed.
    """
    index_gpu = _clone_to_gpu(index, use_raft)
    index_gpu.copyFrom(index)
    index_gpu.add(addVecs)
    index_gpu.nprobe = nprobe
    t0 = time.perf_counter()
    index_gpu.search(queryVecs, k)
    return 1000 * (time.perf_counter() - t0)


n_rows, n_cols = xb.shape
n_train, _ = xt.shape
M = n_cols // args.pq_len
nlist = compute_nlist(n_rows)
index = faiss.index_factory(
    n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))

print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
else:
    classical_gpu_train_time = bench_train_milliseconds(index, xt, False)
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train,
        classical_gpu_train_time, raft_gpu_train_time))

print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
# Train the CPU index once; the add and search benchmarks clone it to GPU.
index.train(xt)
raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
else:
    classical_gpu_add_time = bench_add_milliseconds(index, xb, False)
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows,
        classical_gpu_add_time, raft_gpu_add_time))

if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    print(xq.shape)
    n_add = n_rows
    # `index` was built with these same parameters and trained on xt for the
    # add benchmark, so it is reused here instead of being rebuilt.
    # Sampling is without replacement, so skip sizes larger than the query set.
    queryset_sizes = [
        size for size in (1, 10, 100, 1000, 10000) if size <= xq.shape[0]]
    for n_queries in queryset_sizes:
        # Draw from the seeded generator so repeated runs use the same queries.
        queryVecs = xq[rs.choice(xq.shape[0], n_queries, replace=False)]
        raft_gpu_search_time = bench_search_milliseconds(
            index, xb, queryVecs, args.nprobe, args.k, True)
        if args.raft_only:
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_queries,
                args.nprobe, args.k, raft_gpu_search_time))
        else:
            classical_gpu_search_time = bench_search_milliseconds(
                index, xb, queryVecs, args.nprobe, args.k, False)
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_queries,
                args.nprobe, args.k, classical_gpu_search_time,
                raft_gpu_search_time))