# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function

import numpy as np

import faiss
import unittest

from common import get_dataset_2


class TestClustering(unittest.TestCase):

    def test_clustering(self):
        d = 64
        n = 1000
        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        x *= 10

        km = faiss.Kmeans(d, 32, niter=10)
        err32 = km.train(x)

        # check that the objective is decreasing over iterations
        prev = 1e50
        for o in km.obj:
            self.assertGreater(prev, o)
            prev = o

        km = faiss.Kmeans(d, 64, niter=10)
        err64 = km.train(x)

        # check that 64 centroids give a lower quantization error than 32
        self.assertGreater(err32, err64)

        km = faiss.Kmeans(d, 32, niter=10, int_centroids=True)
        err_int = km.train(x)

        # check that integer centroids are not as good as float ones
        self.assertGreater(err_int, err32)
        self.assertTrue(np.all(km.centroids == np.floor(km.centroids)))

    def test_nasty_clustering(self):
        d = 2
        rs = np.random.RandomState(123)
        x = np.zeros((100, d), dtype='float32')
        for i in range(5):
            x[i * 20:i * 20 + 20] = rs.uniform(size=d)

        # we have 5 distinct points but ask for 10 centroids...
        km = faiss.Kmeans(d, 10, niter=10, verbose=True)
        km.train(x)

    def test_redo(self):
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        # make sure that doing 10 redos yields a better objective than just 1

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatL2(d))
        obj1 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatL2(d))
        obj10 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        self.assertGreater(obj1, obj10)

    def test_redo_cosine(self):
        # test redo with cosine distance (inner product, so objectives are
        # reversed)
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')
        faiss.normalize_L2(x)

        # make sure that doing 10 redos yields a better objective than just 1
        # for cosine distance the objective is an inner product, so higher
        # is better

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatIP(d))
        obj1 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatIP(d))
        obj10 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        self.assertGreater(obj10, obj1)

    def test_1ptpercluster(self):
        # https://github.com/facebookresearch/faiss/issues/842
        # note that np.random.randint(0, 1, ...) draws from {0} only, so the
        # 5 training points are all identical
        X = np.random.randint(0, 1, (5, 10)).astype('float32')
        k = 5
        niter = 10
        verbose = True
        kmeans = faiss.Kmeans(X.shape[1], k, niter=niter, verbose=verbose)
        kmeans.train(X)
        l2_distances, I = kmeans.index.search(X, 1)

    def test_weighted(self):
        d = 32
        sigma = 0.1

        # Data is naturally clustered in 10 clusters.
        # 5 clusters have 100 points
        # 5 clusters have 10 points
        # run k-means with 5 clusters

        ccent = faiss.randn((10, d), 123)
        faiss.normalize_L2(ccent)
        x = [ccent[i] + sigma * faiss.randn((100, d), 1234 + i) for i in range(5)]
        x += [ccent[i] + sigma * faiss.randn((10, d), 1234 + i) for i in range(5, 10)]
        x = np.vstack(x)

        clus = faiss.Clustering(d, 5)
        index = faiss.IndexFlatL2(d)
        clus.train(x, index)
        cdis1, perm1 = index.search(ccent, 1)

        # distance^2 of ground-truth centroids to clusters
        cdis1_first = cdis1[:5].sum()
        cdis1_last = cdis1[5:].sum()

        # now assign weight 0.1 to the 5 first clusters and weight 10
        # to the 5 last ones and re-run k-means
        weights = np.ones(100 * 5 + 10 * 5, dtype='float32')
        weights[:100 * 5] = 0.1
        weights[100 * 5:] = 10

        clus = faiss.Clustering(d, 5)
        index = faiss.IndexFlatL2(d)
        clus.train(x, index, weights=weights)
        cdis2, perm2 = index.search(ccent, 1)

        # distance^2 of ground-truth centroids to clusters
        cdis2_first = cdis2[:5].sum()
        cdis2_last = cdis2[5:].sum()

        print(cdis1_first, cdis1_last)
        print(cdis2_first, cdis2_last)

        # with the new clustering, the last 5 should be much (2x) closer
        # to their centroids
        self.assertGreater(cdis1_last, cdis1_first * 2)
        self.assertGreater(cdis2_first, cdis2_last * 2)

    def test_encoded(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)

        # make sure that training on a compressed then decompressed
        # dataset gives the same result as decompressing on-the-fly

        codec = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_4bit)
        codec.train(xt)
        codes = codec.sa_encode(xt)

        xt2 = codec.sa_decode(codes)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        index = faiss.IndexFlatL2(d)
        clus.train(xt2, index)
        ref_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, ref_errs = index.search(xt2, 1)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        # decode the codes in blocks of 120 vectors (not a divisor of 1000,
        # so the last block is partial)
        clus.decode_block_size = 120
        index = faiss.IndexFlatL2(d)
        clus.train_encoded(codes, codec, index)
        new_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, new_errs = index.search(xt2, 1)

        # It's the same operation, so the result should be bit-exact
        self.assertTrue(np.all(ref_centroids == new_centroids))

    def test_init(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
        km = faiss.Kmeans(d, k, niter=4)
        km.train(xt)

        # use the previously trained centroids as initialization
        km2 = faiss.Kmeans(d, k, niter=4)
        km2.train(xt, init_centroids=km.centroids)

        # check that the initial objective is better for km2 than for km
        self.assertGreater(km.obj[0], km2.obj[0] * 1.01)

    def test_stats(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
        km = faiss.Kmeans(d, k, niter=4)
        km.train(xt)
        assert list(km.obj) == [st['obj'] for st in km.iteration_stats]
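
    # Note: faiss.Kmeans is a convenience wrapper around the lower-level
    # faiss.Clustering API exercised in test_redo / test_encoded above. A
    # rough sketch of the equivalence (an illustration, not authoritative):
    #
    #   clus = faiss.Clustering(d, k)
    #   clus.niter = niter
    #   index = faiss.IndexFlatL2(d)
    #   clus.train(x, index)
    #   centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)

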
class TestPCA(unittest.TestCase):

    def test_pca(self):
        d = 64
        n = 1000
        np.random.seed(123)
        x = np.random.random(size=(n, d)).astype('float32')

        pca = faiss.PCAMatrix(d, 10)
        pca.train(x)
        y = pca.apply_py(x)

        # check that energy per component is decreasing
        column_norm2 = (y**2).sum(0)

        prev = 1e50
        for o in column_norm2:
            self.assertGreater(prev, o)
            prev = o
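
    # The decreasing energies above are the leading eigenvalues of the data
    # scatter matrix. A NumPy cross-check (a sketch, assuming PCAMatrix
    # centers the data during training):
    #
    #   xc = x - x.mean(axis=0)
    #   eigvals = np.linalg.eigvalsh(xc.T @ xc)[::-1]
    #   # eigvals[:10] should approximately match column_norm2

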
class TestRevSwigPtr(unittest.TestCase):

    def test_rev_swig_ptr(self):

        index = faiss.IndexFlatL2(4)
        xb0 = np.vstack([
            i * 10 + np.array([1, 2, 3, 4], dtype='float32')
            for i in range(5)])
        index.add(xb0)
        # map the index's internal storage back to a numpy array
        xb = faiss.rev_swig_ptr(index.xb.data(), 4 * 5).reshape(5, 4)
        self.assertEqual(np.abs(xb0 - xb).sum(), 0)


class TestException(unittest.TestCase):

    def test_exception(self):

        index = faiss.IndexFlatL2(10)

        a = np.zeros((5, 10), dtype='float32')
        b = np.zeros(5, dtype='int64')

        # an unsupported operation for IndexFlat
        self.assertRaises(
            RuntimeError,
            index.add_with_ids, a, b
        )
        # assert 'add_with_ids not implemented' in str(e)

    def test_exception_2(self):
        self.assertRaises(
            RuntimeError,
            faiss.index_factory, 12, 'IVF256,Flat,PQ8'
        )
        # assert 'could not parse' in str(e)


class TestMapLong2Long(unittest.TestCase):

    def test_maplong2long(self):
        keys = np.array([13, 45, 67], dtype=np.int64)
        vals = np.array([3, 8, 2], dtype=np.int64)

        m = faiss.MapLong2Long()
        m.add(keys, vals)

        assert np.all(m.search_multiple(keys) == vals)

        # a missing key maps to -1
        assert m.search(12343) == -1


class TestOrthogonalReconstruct(unittest.TestCase):

    def test_recons_orthonormal(self):
        lt = faiss.LinearTransform(20, 10, True)
        rs = np.random.RandomState(10)
        A, _ = np.linalg.qr(rs.randn(20, 20))
        A = A[:10].astype('float32')
        faiss.copy_array_to_vector(A.ravel(), lt.A)
        faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b)

        lt.set_is_orthonormal()
        lt.is_trained = True
        assert lt.is_orthonormal

        x = rs.rand(30, 20).astype('float32')
        xt = lt.apply_py(x)
        xtt = lt.reverse_transform(xt)
        xttt = lt.apply_py(xtt)

        err = ((xt - xttt)**2).sum()

        self.assertGreater(1e-5, err)

    def test_recons_orthogonal_impossible(self):
        lt = faiss.LinearTransform(20, 10, True)
        rs = np.random.RandomState(10)
        A = rs.randn(10 * 20).astype('float32')
        faiss.copy_array_to_vector(A.ravel(), lt.A)
        faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b)
        lt.is_trained = True

        lt.set_is_orthonormal()
        assert not lt.is_orthonormal

        x = rs.rand(30, 20).astype('float32')
        xt = lt.apply_py(x)
        try:
            lt.reverse_transform(xt)
        except Exception:
            pass
        else:
            self.fail('reverse_transform should have raised an exception')
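
    # For a row-orthonormal A (A A^T = I) the reverse transform can undo the
    # forward one: with y = A x + b, taking x' = A^T (y - b) gives
    # A x' + b = A A^T (y - b) + b = y, which is what the first test checks
    # (a sketch of the math, not a statement about the exact C++ code).

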
class TestMAdd(unittest.TestCase):

    def test_1(self):
        # try with dimensions that are multiples of 16 or not
        rs = np.random.RandomState(123)
        swig_ptr = faiss.swig_ptr
        for dim in 16, 32, 20, 25:
            for _repeat in 1, 2, 3, 4, 5:
                a = rs.rand(dim).astype('float32')
                b = rs.rand(dim).astype('float32')
                c = np.zeros(dim, dtype='float32')
                # random scalar coefficient for the multiply-add
                bf = rs.uniform(5.0) - 2.5
                # computes c = a + bf * b and returns the argmin of c
                idx = faiss.fvec_madd_and_argmin(
                    dim, swig_ptr(a), bf, swig_ptr(b),
                    swig_ptr(c))
                ref_c = a + b * bf
                assert np.abs(c - ref_c).max() < 1e-5
                assert idx == ref_c.argmin()


class TestNyFuncs(unittest.TestCase):

    def test_l2(self):
        rs = np.random.RandomState(123)
        swig_ptr = faiss.swig_ptr
        for d in 1, 2, 4, 8, 12, 16:
            x = rs.rand(d).astype('float32')
            for ny in 128, 129, 130:
                print("d=%d ny=%d" % (d, ny))
                y = rs.rand(ny, d).astype('float32')
                ref = ((x - y) ** 2).sum(1)
                new = np.zeros(ny, dtype='float32')
                faiss.fvec_L2sqr_ny(swig_ptr(new), swig_ptr(x),
                                    swig_ptr(y), d, ny)
                assert np.abs(ref - new).max() < 1e-4

    def test_IP(self):
        # this one is not optimized with SIMD but tested just in case
        rs = np.random.RandomState(123)
        swig_ptr = faiss.swig_ptr
        for d in 1, 2, 4, 8, 12, 16:
            x = rs.rand(d).astype('float32')
            for ny in 128, 129, 130:
                print("d=%d ny=%d" % (d, ny))
                y = rs.rand(ny, d).astype('float32')
                ref = (x * y).sum(1)
                new = np.zeros(ny, dtype='float32')
                faiss.fvec_inner_products_ny(
                    swig_ptr(new), swig_ptr(x), swig_ptr(y), d, ny)
                assert np.abs(ref - new).max() < 1e-4


class TestMatrixStats(unittest.TestCase):

    def test_0s(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        m[5:10] = 0
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert 'has 5 copies' in comments
        assert '5 null vectors' in comments

    def test_copies(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        m[::2] = m[1::2]
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert '20 vectors are distinct' in comments

    def test_dead_dims(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        m[:, 5:10] = 0
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert '5 dimensions are constant' in comments

    def test_rogue_means(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        m[:, 5:10] += 12345
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert '5 dimensions are too large wrt. their variance' in comments

    def test_normalized(self):
        rs = np.random.RandomState(123)
        m = rs.rand(40, 20).astype('float32')
        faiss.normalize_L2(m)
        comments = faiss.MatrixStats(m).comments
        print(comments)
        assert 'vectors are normalized' in comments


class TestScalarQuantizer(unittest.TestCase):

    def test_8bit_equiv(self):
        rs = np.random.RandomState(123)
        for _it in range(20):
            for d in 13, 16, 24:
                x = np.floor(rs.rand(5, d) * 256).astype('float32')
                x[0] = 0
                x[1] = 255

                # make sure to test extreme cases
                x[2, 0] = 0
                x[3, 0] = 255
                x[2, 1] = 255
                x[3, 1] = 0

                ref_index = faiss.IndexScalarQuantizer(
                    d, faiss.ScalarQuantizer.QT_8bit)
                ref_index.train(x[:2])
                ref_index.add(x[2:3])

                index = faiss.IndexScalarQuantizer(
                    d, faiss.ScalarQuantizer.QT_8bit_direct)
                assert index.is_trained
                index.add(x[2:3])

                # the codes must be identical for both quantizers
                assert np.all(
                    faiss.vector_to_array(ref_index.codes) ==
                    faiss.vector_to_array(index.codes))

                # Note that distances are not the same because ref_index
                # reconstructs x as x + 0.5
                D, I = index.search(x[3:], 1)

                # assert D[0, 0] == Dref[0, 0]
                # print(D[0, 0], ((x[3] - x[2]) ** 2).sum())
                assert D[0, 0] == ((x[3] - x[2]) ** 2).sum()
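
    # Worked example of the note above (an illustration, following the
    # comment that QT_8bit reconstructs x as x + 0.5): an integer component
    # stored as code 5 decodes to 5.5 under QT_8bit trained on [0, 255],
    # while QT_8bit_direct decodes the same code to exactly 5.0. The codes
    # agree; the reconstructions differ by a constant 0.5 per component.
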
    def test_6bit_equiv(self):
        rs = np.random.RandomState(123)
        for d in 3, 6, 8, 16, 36:
            # train on the two extreme vectors so the quantizer spans [0, 63]
            trainset = np.zeros((2, d), dtype='float32')
            trainset[0, :] = 0
            trainset[1, :] = 63

            index = faiss.IndexScalarQuantizer(
                d, faiss.ScalarQuantizer.QT_6bit)
            index.train(trainset)

            print('cs=', index.code_size)

            x = rs.randint(64, size=(100, d)).astype('float32')

            # verify encoder / decoder
            index.add(x)
            x2 = index.reconstruct_n(0, x.shape[0])
            assert np.all(x == x2 - 0.5)

            # verify AVX decoder (used only for search)
            y = 63 * rs.rand(20, d).astype('float32')

            D, I = index.search(y, 10)
            for i in range(20):
                for j in range(10):
                    dis = ((y[i] - x2[I[i, j]]) ** 2).sum()
                    # print(dis, D[i, j])
                    assert abs(D[i, j] - dis) / dis < 1e-5


class TestRandom(unittest.TestCase):

    def test_rand(self):
        x = faiss.rand(2000)
        assert np.all(x >= 0) and np.all(x < 1)
        h, _ = np.histogram(x, np.arange(0, 1, 0.1))
        assert h.min() > 160 and h.max() < 240

    def test_randint(self):
        x = faiss.randint(20000, vmax=100)
        assert np.all(x >= 0) and np.all(x < 100)
        c = np.bincount(x, minlength=100)
        print(c)
        assert c.max() - c.min() < 50 * 2


class TestPairwiseDis(unittest.TestCase):

    def test_L2(self):
        swig_ptr = faiss.swig_ptr
        x = faiss.rand((100, 10), seed=1)
        y = faiss.rand((200, 10), seed=2)
        ix = faiss.randint(50, vmax=100)
        iy = faiss.randint(50, vmax=200)
        dis = np.empty(50, dtype='float32')
        faiss.pairwise_indexed_L2sqr(
            10, 50,
            swig_ptr(x), swig_ptr(ix),
            swig_ptr(y), swig_ptr(iy),
            swig_ptr(dis))

        for i in range(50):
            assert np.allclose(
                dis[i], ((x[ix[i]] - y[iy[i]]) ** 2).sum())

    def test_IP(self):
        swig_ptr = faiss.swig_ptr
        x = faiss.rand((100, 10), seed=1)
        y = faiss.rand((200, 10), seed=2)
        ix = faiss.randint(50, vmax=100)
        iy = faiss.randint(50, vmax=200)
        dis = np.empty(50, dtype='float32')
        faiss.pairwise_indexed_inner_product(
            10, 50,
            swig_ptr(x), swig_ptr(ix),
            swig_ptr(y), swig_ptr(iy),
            swig_ptr(dis))

        for i in range(50):
            assert np.allclose(
                dis[i], np.dot(x[ix[i]], y[iy[i]]))
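
    # In vectorized NumPy, the quantities checked element by element above
    # are simply (a sketch, reusing this test's variables):
    #
    #   ref_l2 = ((x[ix] - y[iy]) ** 2).sum(axis=1)
    #   ref_ip = (x[ix] * y[iy]).sum(axis=1)

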
class TestSWIGWrap(unittest.TestCase):
    """ various regressions with the SWIG wrapper """

    def test_size_t_ptr(self):
        # issue 1064
        index = faiss.IndexHNSWFlat(10, 32)

        hnsw = index.hnsw
        index.add(np.random.rand(100, 10).astype('float32'))
        be = np.empty(2, 'uint64')
        hnsw.neighbor_range(23, 0, faiss.swig_ptr(be), faiss.swig_ptr(be[1:]))

    def test_id_map_at(self):
        # issue 1020
        n_features = 100
        feature_dims = 10

        features = np.random.random((n_features, feature_dims)).astype(np.float32)
        idx = np.arange(n_features).astype(np.int64)

        index = faiss.IndexFlatL2(feature_dims)
        index = faiss.IndexIDMap2(index)
        index.add_with_ids(features, idx)

        # this should run through without raising an exception
        [index.id_map.at(int(i)) for i in range(index.ntotal)]

    def test_downcast_Refine(self):

        index = faiss.IndexRefineFlat(
            faiss.IndexScalarQuantizer(10, faiss.ScalarQuantizer.QT_8bit)
        )

        # serialize and deserialize
        index2 = faiss.deserialize_index(
            faiss.serialize_index(index)
        )

        assert isinstance(index2, faiss.IndexRefineFlat)

    def do_test_array_type(self, dtype):
        """ tests swig_ptr and rev_swig_ptr for this type of array """
        a = np.arange(12).astype(dtype)
        ptr = faiss.swig_ptr(a)
        print(ptr)
        a2 = faiss.rev_swig_ptr(ptr, 12)
        np.testing.assert_array_equal(a, a2)

    def test_all_array_types(self):
        self.do_test_array_type('float32')
        self.do_test_array_type('float64')
        self.do_test_array_type('int8')
        self.do_test_array_type('uint8')
        self.do_test_array_type('int16')
        self.do_test_array_type('uint16')
        self.do_test_array_type('int32')
        self.do_test_array_type('uint32')
        self.do_test_array_type('int64')
        self.do_test_array_type('uint64')


class TestPartitioning(unittest.TestCase):

    def do_partition(self, n, q, maxval=None, seed=None):
        if seed is None:
            # without an explicit seed, re-run the test with 50 fixed seeds
            for i in range(50):
                self.do_partition(n, q, maxval, i + 1234)
            return
        # print("seed=", seed)
        rs = np.random.RandomState(seed)
        if maxval is None:
            vals = rs.rand(n).astype('float32')
        else:
            vals = rs.randint(maxval, size=n).astype('float32')

        ids = (rs.permutation(n) + 12345).astype('int64')
        dic = dict(zip(ids, vals))

        # print("seed=", seed, "q=", q, "n=", n)
        # print(vals)
        # print(ids)

        vals_orig = vals.copy()

        sp = faiss.swig_ptr
        if type(q) == int:
            faiss.CMax_float_partition_fuzzy(
                sp(vals), sp(ids), n,
                q, q, None
            )
        else:
            q_min, q_max = q
            q = np.array([-1], dtype='uint64')
            faiss.CMax_float_partition_fuzzy(
                sp(vals), sp(ids), n,
                q_min, q_max, sp(q)
            )
            q = q[0]
            assert q_min <= q <= q_max

        o = vals_orig.argsort()
        thresh = vals_orig[o[q]]
        n_eq = (vals_orig[o[:q]] == thresh).sum()

        # the first q elements must be <= the threshold, with the
        # id -> value mapping kept intact, and ties with the threshold
        # must not exceed their count in the sorted order
        for i in range(q):
            self.assertEqual(vals[i], dic[ids[i]])
            self.assertLessEqual(vals[i], thresh)
            if vals[i] == thresh:
                n_eq -= 1
        self.assertEqual(n_eq, 0)
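
    # For reference, NumPy's np.argpartition performs the non-fuzzy version
    # of this operation: after o = np.argpartition(vals_orig, q), the values
    # vals_orig[o[:q]] are all <= vals_orig[o[q]] (a sketch for comparison,
    # not part of the test).
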
    def test_partition(self):
        self.do_partition(160, 80)

    def test_partition_manydups(self):
        self.do_partition(160, 80, maxval=16)

    def test_partition_lowq(self):
        self.do_partition(160, 10, maxval=16)

    def test_partition_highq(self):
        self.do_partition(165, 155, maxval=16)

    def test_partition_q10(self):
        self.do_partition(32, 10, maxval=500)

    def test_partition_q10_dups(self):
        self.do_partition(32, 10, maxval=16)

    def test_partition_q10_fuzzy(self):
        self.do_partition(32, (10, 15), maxval=500)

    def test_partition_fuzzy(self):
        self.do_partition(160, (70, 80), maxval=500)

    def test_partition_fuzzy_2(self):
        self.do_partition(160, (70, 80))


if __name__ == '__main__':
    unittest.main()