Mirror of https://github.com/facebookresearch/faiss.git
synced 2025-06-03 21:54:02 +08:00
Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3154

Uses the benchmark to find Pareto-optimal indices, here on BigANN as an example. The coarse quantizer and the vector codec are optimized separately, and the Pareto-optimal configurations are then combined into IVF indices, which are retested at various scales. See `optimize()` in `optimize.py` for the main function driving the process.

The results can be interpreted with `bench_fw_notebook.ipynb`, which allows:
* filtering by maximum code size,
* filtering by maximum time,
* filtering by minimum accuracy,
* selecting space- or time-Pareto-optimal options,
and visualizes the results and outputs them as a table.

This version is intentionally limited to IVF(Flat|HNSW),PQ|SQ indices...

Reviewed By: mdouze

Differential Revision: D51781670

fbshipit-source-id: 2c0f800d374ea845255934f519cc28095c00a51f
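A minimal sketch of how this `Optimizer` might be driven. The import paths, the `BenchmarkIO` constructor, and the `DatasetDescriptor` field names below are illustrative assumptions, not the exact bench_fw API; only `Optimizer(...)`, `set_io()` and `optimize()` come from the file itself:

from bench_fw.benchmark_io import BenchmarkIO        # assumed import path
from bench_fw.descriptors import DatasetDescriptor   # assumed import path
from bench_fw.optimize import Optimizer              # assumed import path

# I/O object that knows where datasets and results live (constructor assumed)
io = BenchmarkIO(path="/data/bigann")

opt = Optimizer(distance_metric="L2", num_threads=32, run_local=True)
opt.set_io(io)

# Run the full optimization: nprobe calibration, codec selection, coarse
# quantizer selection, then IVF combinations retested at each database scale.
opt.optimize(
    d=128,  # BigANN (SIFT) dimensionality
    training_vectors=DatasetDescriptor(tablename="bigann_learn"),    # field name assumed
    database_vectors_list=[
        DatasetDescriptor(tablename="bigann_base_10M"),              # field name assumed
        DatasetDescriptor(tablename="bigann_base_100M"),
    ],
    query_vectors=DatasetDescriptor(tablename="bigann_query"),
    min_accuracy=0.9,
)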
334 lines
11 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from dataclasses import dataclass
from typing import Dict, List, Tuple

import faiss  # @manual=//faiss/python:pyfaiss_gpu

# from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
#     OperatingPoints,
# )

from .benchmark import Benchmark
from .descriptors import DatasetDescriptor, IndexDescriptor
from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode

logger = logging.getLogger(__name__)


@dataclass
class Optimizer:
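    """
    Separately optimizes the coarse quantizer and the vector codec of an IVF
    index on a benchmark dataset and combines the Pareto-optimal
    configurations, as driven by optimize() below.
    """
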
    distance_metric: str = "L2"
    num_threads: int = 32
    run_local: bool = True

    def __post_init__(self):
        self.cached_benchmark = None
        if self.distance_metric == "IP":
            self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
        elif self.distance_metric == "L2":
            self.distance_metric_type = faiss.METRIC_L2
        else:
            raise ValueError

    def set_io(self, benchmark_io):
        self.io = benchmark_io
        self.io.distance_metric = self.distance_metric
        self.io.distance_metric_type = self.distance_metric_type

    def benchmark_and_filter_candidates(
        self,
        index_descs,
        training_vectors,
        database_vectors,
        query_vectors,
        result_file,
        include_flat,
        min_accuracy,
        pareto_metric,
    ):
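        """
        Benchmark the given index descriptors and keep only the candidates
        that reach min_accuracy (knn_intersection) and are Pareto optimal for
        pareto_metric; returns (surviving IndexDescriptors, filtered results).
        """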
        benchmark = Benchmark(
            num_threads=self.num_threads,
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            index_descs=index_descs,
            k=10,
            distance_metric=self.distance_metric,
        )
        benchmark.set_io(self.io)
        results = benchmark.benchmark(
            result_file=result_file, local=self.run_local, train=True, knn=True
        )
        assert results
        filtered = filter_results(
            results=results,
            evaluation="knn",
            accuracy_metric="knn_intersection",
            min_accuracy=min_accuracy,
            name_filter=None
            if include_flat
            else (lambda n: not n.startswith("Flat")),
            pareto_mode=ParetoMode.GLOBAL,
            pareto_metric=pareto_metric,
        )
        assert filtered
        index_descs = [
            IndexDescriptor(
                factory=v["factory"],
                construction_params=v["construction_params"],
                search_params=v["search_params"],
            )
            for _, _, _, _, v in filtered
        ]
        return index_descs, filtered

    def optimize_quantizer(
        self,
        training_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlists: List[int],
        min_accuracy: float,
    ):
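        """
        For each nlist, cluster the training vectors and benchmark Flat and
        HNSW32 (efConstruction 64..1024) indices over the resulting centroids,
        keeping the time-Pareto-optimal candidates as coarse quantizers.
        """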
        quantizer_descs = {}
        for nlist in nlists:
            # cluster
            centroids, _, _ = training_vectors.k_means(
                self.io,
                nlist,
                dry_run=False,
            )

            descs = [
                IndexDescriptor(factory="Flat"),
            ] + [
                IndexDescriptor(
                    factory="HNSW32",
                    construction_params=[{"efConstruction": 2**i}],
                )
                for i in range(6, 11)
            ]

            descs, _ = self.benchmark_and_filter_candidates(
                descs,
                training_vectors=centroids,
                database_vectors=centroids,
                query_vectors=query_vectors,
                result_file=f"result_{centroids.get_filename()}json",
                include_flat=True,
                min_accuracy=min_accuracy,
                pareto_metric=ParetoMetric.TIME,
            )
            quantizer_descs[nlist] = descs

        return quantizer_descs

    def optimize_ivf(
        self,
        result_file: str,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        quantizers: Dict[int, List[IndexDescriptor]],
        codecs: List[Tuple[str, str]],
        min_accuracy: float,
    ):
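        """
        Cross every coarse quantizer with every (pretransform, codec) pair,
        e.g. "OPQ32_128,IVF4096(HNSW32),PQ32x8", benchmark the resulting IVF
        indices and keep the time-and-space Pareto optima.
        """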
        ivf_descs = []
        for nlist, quantizer_descs in quantizers.items():
            # build IVF index
            for quantizer_desc in quantizer_descs:
                for pretransform, fine_ivf in codecs:
                    if pretransform is None:
                        pretransform = ""
                    else:
                        pretransform = pretransform + ","
                    if quantizer_desc.construction_params is None:
                        construction_params = [
                            None,
                            quantizer_desc.search_params,
                        ]
                    else:
                        construction_params = [
                            None
                        ] + quantizer_desc.construction_params
                        if quantizer_desc.search_params is not None:
                            dict_merge(
                                construction_params[1],
                                quantizer_desc.search_params,
                            )
                    ivf_descs.append(
                        IndexDescriptor(
                            factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
                            construction_params=construction_params,
                        )
                    )
        return self.benchmark_and_filter_candidates(
            ivf_descs,
            training_vectors,
            database_vectors,
            query_vectors,
            result_file,
            include_flat=False,
            min_accuracy=min_accuracy,
            pareto_metric=ParetoMetric.TIME_SPACE,
        )

    # train an IVFFlat index
    # find the nprobe required for the given accuracy
    def ivf_flat_nprobe_required_for_accuracy(
        self,
        result_file: str,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlist,
        accuracy,
    ):
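        """
        Returns the smallest nprobe (at most nlist // 2) whose
        knn_intersection reaches the requested accuracy.
        """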
        _, results = self.benchmark_and_filter_candidates(
            index_descs=[
                IndexDescriptor(factory=f"IVF{nlist}(Flat),Flat"),
            ],
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            result_file=result_file,
            include_flat=False,
            min_accuracy=accuracy,
            pareto_metric=ParetoMetric.TIME,
        )
        nprobe = nlist // 2
        for _, _, _, k, v in results:
            if (
                ".knn" in k
                and "nprobe" in v["search_params"]
                and v["knn_intersection"] >= accuracy
            ):
                nprobe = min(nprobe, v["search_params"]["nprobe"])
        return nprobe

    # train candidate IVF codecs
    # benchmark them at the same nprobe
    # keep only the space _and_ time Pareto optimal
    def optimize_codec(
        self,
        result_file: str,
        d: int,
        training_vectors: DatasetDescriptor,
        database_vectors: DatasetDescriptor,
        query_vectors: DatasetDescriptor,
        nlist: int,
        nprobe: int,
        min_accuracy: float,
    ):
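        """
        Candidates are Flat, SQfp16, SQ8 and (O)PQ codecs with a code size
        below SQ8, e.g. (None, "PQ16x8") or ("OPQ32_128", "PQ32x8") for d=128;
        returns the (pretransform, codec) pairs that survive the filtering.
        """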
        codecs = (
            [
                (None, "Flat"),
                (None, "SQfp16"),
                (None, "SQ8"),
            ] + [
                (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
                if d % M == 0
                for dim in range(2, 18, 2)
                if M * dim <= d
                for b in range(4, 14, 2)
                if M * b < d * 8  # smaller than SQ8
            ] + [
                (None, f"PQ{M}x{b}")
                for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
                if d % M == 0
                for b in range(8, 14, 2)
                if M * b < d * 8  # smaller than SQ8
            ]
        )
        factory = {}
        for opq, pq in codecs:
            factory[
                f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}"
            ] = (
                opq,
                pq,
            )

        _, filtered = self.benchmark_and_filter_candidates(
            index_descs=[
                IndexDescriptor(
                    factory=f"IVF{nlist},{pq}"
                    if opq is None
                    else f"{opq},IVF{nlist},{pq}",
                    search_params={
                        "nprobe": nprobe,
                    },
                )
                for opq, pq in codecs
            ],
            training_vectors=training_vectors,
            database_vectors=database_vectors,
            query_vectors=query_vectors,
            result_file=result_file,
            include_flat=False,
            min_accuracy=min_accuracy,
            pareto_metric=ParetoMetric.TIME_SPACE,
        )
        results = [
            factory[r] for r in set(v["factory"] for _, _, _, k, v in filtered)
        ]
        return results

    def optimize(
        self,
        d: int,
        training_vectors: DatasetDescriptor,
        database_vectors_list: List[DatasetDescriptor],
        query_vectors: DatasetDescriptor,
        min_accuracy: float,
    ):
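        """
        Main driver: (1) find the nprobe giving ~95% accuracy with an
        IVF4096(Flat),Flat index, (2) select Pareto-optimal codecs at that
        nprobe, (3) select Pareto-optimal coarse quantizers for several nlist
        values, (4) combine them and re-benchmark at every database scale.
        """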
        # train an IVFFlat index
        # find the nprobe required for near perfect accuracy
        nlist = 4096
        nprobe_at_95 = self.ivf_flat_nprobe_required_for_accuracy(
            result_file=f"result_ivf{nlist}_flat.json",
            training_vectors=training_vectors,
            database_vectors=database_vectors_list[0],
            query_vectors=query_vectors,
            nlist=nlist,
            accuracy=0.95,
        )

        # train candidate IVF codecs
        # benchmark them at the same nprobe
        # keep only the space and time Pareto optima
        codecs = self.optimize_codec(
            result_file=f"result_ivf{nlist}_codec.json",
            d=d,
            training_vectors=training_vectors,
            database_vectors=database_vectors_list[0],
            query_vectors=query_vectors,
            nlist=nlist,
            nprobe=nprobe_at_95,
            min_accuracy=min_accuracy,
        )

        # optimize coarse quantizers
        quantizers = self.optimize_quantizer(
            training_vectors=training_vectors,
            query_vectors=query_vectors,
            nlists=[4096, 8192, 16384, 32768],
            min_accuracy=0.7,
        )

        # combine them with the codecs
        # test them at different scales
        for database_vectors in database_vectors_list:
            self.optimize_ivf(
                result_file=f"result_{database_vectors.get_filename()}json",
                training_vectors=training_vectors,
                database_vectors=database_vectors,
                query_vectors=query_vectors,
                quantizers=quantizers,
                codecs=codecs,
                min_accuracy=min_accuracy,
            )